Setting

Caricamento File

Si importano i sette dataset

Importo primo dataset

# Import the first dataset (loyalty subscriptions) and show descriptive statistics.
# read_csv2() uses ';' as field separator; both "NA" and the empty string
# are treated as missing values.
raw1 <- read_csv2(file.path(directory, "raw_1_cli_fid.csv"),
                          na = c("NA", ""))

knitr::kable(head(raw1))
ID_CLI ID_FID ID_NEG TYP_CLI_FID COD_FID STATUS_FID DT_ACTIVE
500 814583 32 1 PREMIUM 1 2019-02-23
16647 781106 44 1 PREMIUM 1 2019-02-02
835335 816369 28 1 PREMIUM 1 2019-02-23
9557 746573 9 1 PREMIUM 1 2019-01-11
767877 741522 41 1 PREMIUM 1 2019-01-07
743090 776971 2 1 PREMIUM 1 2019-01-30
# Descriptive statistics for the first dataset
summary(raw1)
 ID_CLI           ID_FID           ID_NEG      TYP_CLI_FID    

Min. : 1 Min. : 3 Min. : 1.0 Min. :0.0000
1st Qu.:230659 1st Qu.:229067 1st Qu.: 6.0 1st Qu.:1.0000
Median :462034 Median :458969 Median :23.0 Median :1.0000
Mean :462486 Mean :459425 Mean :22.1 Mean :0.9848
3rd Qu.:693200 3rd Qu.:688435 3rd Qu.:36.0 3rd Qu.:1.0000
Max. :934919 Max. :928121 Max. :49.0 Max. :1.0000
COD_FID STATUS_FID DT_ACTIVE
Length:370135 Min. :0.00 Min. :2018-01-01
Class :character 1st Qu.:1.00 1st Qu.:2018-04-15
Mode :character Median :1.00 Median :2018-08-10
Mean :0.99 Mean :2018-08-14
3rd Qu.:1.00 3rd Qu.:2018-11-30
Max. :1.00 Max. :2019-05-11

Importo secondo dataset

# Import the second dataset (client accounts) and show descriptive statistics.
raw2 <- read_csv2(file.path(directory, "raw_2_cli_account.csv"), 
                              na = c("NA", ""))    

knitr::kable(head(raw2))  
ID_CLI EMAIL_PROVIDER W_PHONE ID_ADDRESS TYP_CLI_ACCOUNT TYP_JOB
600125 libero.it NA 584621 4 NA
729642 gmail.com NA 714144 4 NA
304639 yahoo.it 1 284176 4 NA
292497 libero.it 1 272563 4 NA
589492 gmail.com NA 573304 2 NA
638815 oulook.it 1 622947 2 NA
# Descriptive statistics for the second dataset
summary(raw2)               
##      ID_CLI       EMAIL_PROVIDER        W_PHONE        ID_ADDRESS    
##  Min.   :     1   Length:369472      Min.   :1       Min.   :     1  
##  1st Qu.:230783   Class :character   1st Qu.:1       1st Qu.:227903  
##  Median :462063   Mode  :character   Median :1       Median :456720  
##  Mean   :462541                      Mean   :1       Mean   :457283  
##  3rd Qu.:693197                      3rd Qu.:1       3rd Qu.:686533  
##  Max.   :934919                      Max.   :1       Max.   :900091  
##                                      NA's   :27305                   
##  TYP_CLI_ACCOUNT   TYP_JOB         
##  Min.   :2.000   Length:369472     
##  1st Qu.:4.000   Class :character  
##  Median :4.000   Mode  :character  
##  Mean   :3.806                     
##  3rd Qu.:4.000                     
##  Max.   :4.000                     
## 

Importo terzo dataset

# Import the third dataset (client addresses) and show descriptive statistics.
# NOTE(review): unlike the other six imports, only the empty string is treated
# as missing here — presumably deliberate, since the literal "NA" is a valid
# value in this file (e.g. PRV = "NA" is the province code for Napoli).
# Confirm before aligning this call with the others.
raw3 <- read_csv2(file.path(directory, "raw_3_cli_address.csv"), 
                              na = c(""))  

knitr::kable(head(raw3))    
ID_ADDRESS CAP PRV REGION
1337 20083 MI LOMBARDIA
1344 20024 MI LOMBARDIA
1347 20090 MI LOMBARDIA
1347 20090 MI LOMBARDIA
1347 20090 MI LOMBARDIA
1347 20090 MI LOMBARDIA
# Descriptive statistics for the third dataset
summary(raw3)               
##    ID_ADDRESS         CAP                PRV               REGION         
##  Min.   :     1   Length:1211332     Length:1211332     Length:1211332    
##  1st Qu.:221063   Class :character   Class :character   Class :character  
##  Median :437083   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :443391                                                           
##  3rd Qu.:664931                                                           
##  Max.   :900090

Importo quarto dataset

# Import the fourth dataset (privacy flags) and show descriptive statistics.
raw4 <- read_csv2(file.path(directory, "raw_4_cli_privacy.csv"), 
                              na = c("NA", ""))                    

knitr::kable(head(raw4))    
ID_CLI FLAG_PRIVACY_1 FLAG_PRIVACY_2 FLAG_DIRECT_MKT
4691 1 1 1
3434 0 1 0
3533 1 1 1
9866 1 1 1
5799 1 1 1
4660 0 1 0
# Descriptive statistics for the fourth dataset
summary(raw4)               
##      ID_CLI       FLAG_PRIVACY_1   FLAG_PRIVACY_2   FLAG_DIRECT_MKT 
##  Min.   :     1   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:230783   1st Qu.:0.0000   1st Qu.:1.0000   1st Qu.:0.0000  
##  Median :462063   Median :1.0000   Median :1.0000   Median :1.0000  
##  Mean   :462541   Mean   :0.6557   Mean   :0.9356   Mean   :0.6707  
##  3rd Qu.:693197   3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:1.0000  
##  Max.   :934919   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000

Importo quinto dataset

# Import the fifth dataset (campaign catalogue) and show descriptive statistics.
raw5 <- read_csv2(file.path(directory, "raw_5_camp_cat.csv"),
                           na = c("NA", "")) 

knitr::kable(head(raw5))    
ID_CAMP TYP_CAMP CHANNEL_CAMP
757 PERSONALIZED EMAIL
759 PERSONALIZED EMAIL
760 PERSONALIZED EMAIL
761 PERSONALIZED EMAIL
762 PERSONALIZED EMAIL
763 PERSONALIZED EMAIL
# Descriptive statistics for the fifth dataset
summary(raw5)               
##     ID_CAMP         TYP_CAMP         CHANNEL_CAMP      
##  Min.   :   5.0   Length:848         Length:848        
##  1st Qu.: 327.8   Class :character   Class :character  
##  Median : 561.5   Mode  :character   Mode  :character  
##  Mean   : 559.6                                        
##  3rd Qu.: 812.2                                        
##  Max.   :1052.0

Importo sesto dataset

# Import the sixth dataset (campaign events) and show descriptive statistics.
raw6 <- read_csv2(file.path(directory, "raw_6_camp_event.csv"), 
                             na = c("NA", ""))  

knitr::kable(head(raw6))    
ID_EVENT ID_CLI ID_CAMP ID_DELIVERY TYP_EVENT EVENT_DATE
11548588 411925 948 8996 V 2019-01-01 01:42:24
11548640 313259 949 8997 V 2019-01-01 02:54:04
11548572 327299 941 8817 V 2019-01-01 01:23:29
11548515 627427 923 8263 V 2019-01-01 00:43:06
11548609 265258 950 8998 V 2019-01-01 02:11:03
11548497 693938 955 9115 V 2019-01-01 00:29:04
# Descriptive statistics for the sixth dataset
summary(raw6)               
##     ID_EVENT            ID_CLI          ID_CAMP        ID_DELIVERY   
##  Min.   :11548441   Min.   :     1   Min.   : 148.0   Min.   : 7680  
##  1st Qu.:13526812   1st Qu.:208129   1st Qu.: 970.0   1st Qu.:10066  
##  Median :14867410   Median :398084   Median : 991.0   Median :10585  
##  Mean   :14757544   Mean   :413793   Mean   : 928.9   Mean   :10534  
##  3rd Qu.:16284762   3rd Qu.:618440   3rd Qu.:1024.0   3rd Qu.:11151  
##  Max.   :17650340   Max.   :931973   Max.   :1048.0   Max.   :11509  
##   TYP_EVENT           EVENT_DATE                 
##  Length:2060360     Min.   :2019-01-01 00:00:46  
##  Class :character   1st Qu.:2019-02-13 13:01:06  
##  Mode  :character   Median :2019-02-28 14:01:27  
##                     Mean   :2019-03-04 14:23:02  
##                     3rd Qu.:2019-04-04 17:52:18  
##                     Max.   :2019-04-30 23:58:41

Importo settimo dataset

# Import the seventh dataset (till receipts) and show descriptive statistics.
raw7 <- read_csv2(file.path(directory, "raw_7_tic.csv"), 
                      na = c("NA", ""))  
                      
knitr::kable(head(raw7))    
ID_SCONTRINO ID_CLI ID_NEG ID_ARTICOLO COD_REPARTO DIREZIONE IMPORTO_LORDO SCONTO DATETIME
51299709701/05/18718488513770 318714 48 34216854 2 1 22.80 0.05 2018-05-01 07:18:48
51299709701/05/18718488513770 318714 48 34216854 2 1 22.75 0.00 2018-05-01 07:18:48
51399406901/05/18718488528430 324419 47 36167733 9 1 1290.00 491.00 2018-05-01 07:18:48
511499219301/05/18718488544998 332611 36 32404540 13 1 134.00 34.10 2018-05-01 07:18:48
511599623601/05/18718498530796 325589 35 35811412 9 1 399.00 0.00 2018-05-01 07:18:49
511599623601/05/18718498530796 325589 35 35811741 9 1 474.00 0.00 2018-05-01 07:18:49
# Descriptive statistics for the seventh dataset
summary(raw7)               
##  ID_SCONTRINO           ID_CLI           ID_NEG       ID_ARTICOLO      
##  Length:4263220     Min.   :     5   Min.   : 2.00   Min.   :   16921  
##  Class :character   1st Qu.:211201   1st Qu.:14.00   1st Qu.:34141800  
##  Mode  :character   Median :411821   Median :27.00   Median :35558446  
##                     Mean   :411433   Mean   :25.93   Mean   :38022591  
##                     3rd Qu.:601672   3rd Qu.:38.00   3rd Qu.:36144465  
##                     Max.   :934729   Max.   :49.00   Max.   :82231481  
##   COD_REPARTO       DIREZIONE       IMPORTO_LORDO           SCONTO         
##  Min.   : 1.000   Min.   :-1.0000   Min.   :-17465.00   Min.   :-2472.000  
##  1st Qu.: 3.000   1st Qu.: 1.0000   1st Qu.:     2.67   1st Qu.:    0.000  
##  Median : 7.000   Median : 1.0000   Median :     7.50   Median :    0.000  
##  Mean   : 7.393   Mean   : 0.8866   Mean   :    32.65   Mean   :    2.329  
##  3rd Qu.:11.000   3rd Qu.: 1.0000   3rd Qu.:    22.00   3rd Qu.:    0.000  
##  Max.   :14.000   Max.   : 1.0000   Max.   : 17465.00   Max.   : 4853.590  
##     DATETIME                  
##  Min.   :2018-05-01 07:18:48  
##  1st Qu.:2018-08-12 14:05:04  
##  Median :2018-11-09 16:37:45  
##  Mean   :2018-11-06 03:52:05  
##  3rd Qu.:2019-02-02 12:49:36  
##  Max.   :2019-04-30 21:57:25

Preparazione dei dati per l’analisi

Si puliscono i dataset e si correggono gli eventuali errori o imprecisioni per favorire un’analisi più accurata ed aderente alla realtà

# Work on copies of the raw tables so the originals remain available as a
# backup. (Fixed: comment typo and inconsistent spacing around `<-`.)
data1 <- raw1
data2 <- raw2
data3 <- raw3
data4 <- raw4
data5 <- raw5
data6 <- raw6
data7 <- raw7

Primo dataset

Pulizia

# Duplicate check: compare the number of distinct clients, distinct cards and
# distinct client-card pairs against the total row count.
data1_dup <- data1 %>%
  summarise(
    TOT_ID_CLIs    = n_distinct(ID_CLI),
    TOT_ID_FIDs    = n_distinct(ID_FID),
    TOT_ID_CLIFIDs = n_distinct(paste0(as.character(ID_CLI), "-", as.character(ID_FID))),
    TOT_ROWs       = n()
  )

knitr::kable(data1_dup)
TOT_ID_CLIs TOT_ID_FIDs TOT_ID_CLIFIDs TOT_ROWs
369472 367925 370135 370135
# Parse the activation date and turn the categorical flags into factors.
data1 <- data1 %>%
  mutate(
    DT_ACTIVE   = as.Date(DT_ACTIVE),
    TYP_CLI_FID = as.factor(TYP_CLI_FID),
    STATUS_FID  = as.factor(STATUS_FID)
  )
# Per-client counts: number of distinct loyalty cards and of distinct
# activation dates.
fid_sub_cli <- data1 %>%
  group_by(ID_CLI) %>%
  summarise(
    NUM_FIDs  = n_distinct(ID_FID),
    NUM_DATEs = n_distinct(DT_ACTIVE)
  )
# Percentage distribution of clients by (number of cards, number of
# activation dates) combination.

tot_id_cli <- n_distinct(fid_sub_cli$ID_CLI)

dist_fid_sub_cli <- fid_sub_cli %>%
  group_by(NUM_FIDs, NUM_DATEs) %>%
  summarise(TOT_CLIs = n_distinct(ID_CLI)) %>%
  # summarise() drops only the last grouping level, so the result was still
  # grouped by NUM_FIDs; ungroup explicitly to avoid surprises downstream
  # (the computed values are unchanged — PERCENT_CLIs divides by a scalar).
  ungroup() %>%
  mutate(PERCENT_CLIs = TOT_CLIs / tot_id_cli)

knitr::kable(dist_fid_sub_cli)
NUM_FIDs NUM_DATEs TOT_CLIs PERCENT_CLIs
1 1 368833 0.9982705
2 1 254 0.0006875
2 2 363 0.0009825
3 1 7 0.0000189
3 2 8 0.0000217
3 3 5 0.0000135
4 1 2 0.0000054
# Clients holding exactly three loyalty cards (examples inspected below).
filter(fid_sub_cli, NUM_FIDs == 3)
## # A tibble: 20 x 3
##    ID_CLI NUM_FIDs NUM_DATEs
##     <dbl>    <int>     <int>
##  1   7533        3         3
##  2  11477        3         1
##  3  68556        3         1
##  4  96537        3         1
##  5 223203        3         3
##  6 250133        3         2
##  7 311669        3         1
##  8 320880        3         2
##  9 621814        3         3
## 10 648813        3         3
## 11 662651        3         3
## 12 671097        3         2
## 13 688941        3         2
## 14 722274        3         2
## 15 736520        3         2
## 16 757759        3         2
## 17 792369        3         1
## 18 826640        3         2
## 19 859327        3         1
## 20 879261        3         1
# Drill-down on one client with three cards activated on the same day.
filter(data1, ID_CLI == 96537)
## # A tibble: 3 x 7
##   ID_CLI ID_FID ID_NEG TYP_CLI_FID COD_FID  STATUS_FID DT_ACTIVE 
##    <dbl>  <dbl>  <dbl> <fct>       <chr>    <fct>      <date>    
## 1  96537 226059     21 1           PREMIUM  0          2018-04-14
## 2  96537 226128     21 1           PREMIUM  0          2018-04-14
## 3  96537 226156     21 1           STANDARD 1          2018-04-14
# Drill-down on one client with cards activated on different days.
filter(data1, ID_CLI == 250133)
## # A tibble: 3 x 7
##   ID_CLI ID_FID ID_NEG TYP_CLI_FID COD_FID  STATUS_FID DT_ACTIVE 
##    <dbl>  <dbl>  <dbl> <fct>       <chr>    <fct>      <date>    
## 1 250133 158212     11 1           STANDARD 0          2018-03-15
## 2 250133 309538      1 1           PREMIUM  0          2018-05-22
## 3 250133 310499      1 1           PREMIUM  0          2018-05-22
# Collapse the per-card rows into one row per client, combining information
# from the client's first and last subscription.

# First subscription per client: earliest DT_ACTIVE; ties on the date are
# broken deterministically by taking the smallest ID_FID
# (arrange + row_number() == 1).
client_data1_first <- data1                       %>%
                           group_by(ID_CLI)                    %>% 
                           filter(DT_ACTIVE == min(DT_ACTIVE)) %>% 
                           arrange(ID_FID)                     %>% 
                           filter(row_number() == 1)           %>% 
                           ungroup()                           %>% 
                           as.data.frame()

# Last subscription per client: latest DT_ACTIVE; ties broken by the
# largest ID_FID (descending sort).
client_data1_last <- data1                       %>%
                          group_by(ID_CLI)                    %>% 
                          filter(DT_ACTIVE == max(DT_ACTIVE)) %>% 
                          arrange(desc(ID_FID))               %>% 
                          filter(row_number() == 1)           %>% 
                          ungroup()                           %>%
                          as.data.frame()
# One row per client: last-subscription fields, left-joined with the first
# subscription's store and date, and with the per-client card count
# (NUM_FIDs, factorised) from `fid_sub_cli`.
data1_complete <- client_data1_last                      %>%
                      select(ID_CLI,
                             ID_FID,
                             LAST_COD_FID = COD_FID,
                             LAST_TYP_CLI_FID = TYP_CLI_FID,
                             LAST_STATUS_FID = STATUS_FID,
                             LAST_DT_ACTIVE = DT_ACTIVE)    %>% 
                      left_join(client_data1_first       %>% 
                                  select(ID_CLI,
                                         FIRST_ID_NEG = ID_NEG,
                                         FIRST_DT_ACTIVE = DT_ACTIVE),
                                by = "ID_CLI")              %>%
                      left_join(fid_sub_cli         %>%       
                                  select(ID_CLI,
                                         NUM_FIDs)          %>%
                                  mutate(NUM_FIDs = as.factor(NUM_FIDs)),
                                by = "ID_CLI")

Analisi esplorative

# Distribution of clients across loyalty programmes (last known card type).
data1_fid <- data1_complete %>%
  group_by(LAST_COD_FID) %>%
  summarise(TOT_CLIs = n_distinct(ID_CLI)) %>%
  mutate(PERCENT = TOT_CLIs / sum(TOT_CLIs)) %>%
  arrange(desc(PERCENT))
knitr::kable(data1_fid)
LAST_COD_FID TOT_CLIs PERCENT
STANDARD 289756 0.7842435
PREMIUM 43878 0.1187587
STANDARD BIZ 29148 0.0788910
PREMIUM BIZ 6690 0.0181069
# Bar chart of clients per loyalty programme.
# geom_col() is shorthand for geom_bar(stat = "identity").
ggplot(data1_fid, aes(x = LAST_COD_FID, y = TOT_CLIs)) +
  geom_col(fill = "steelblue") +
  theme_minimal()

# Distribution of the `LAST_STATUS_FID` variable (active vs inactive card).
data1_laststatus <- data1_complete %>%
  group_by(LAST_STATUS_FID) %>%
  summarise(TOT_CLIs = n_distinct(ID_CLI)) %>%
  mutate(PERCENT = TOT_CLIs / sum(TOT_CLIs)) %>%
  arrange(desc(PERCENT))
knitr::kable(data1_laststatus)
LAST_STATUS_FID TOT_CLIs PERCENT
1 366413 0.9917206
0 3059 0.0082794
# Bar chart of clients by last card status.
# geom_col() is shorthand for geom_bar(stat = "identity").
ggplot(data1_laststatus, aes(x = LAST_STATUS_FID, y = TOT_CLIs)) +
  geom_col(fill = "steelblue") +
  theme_minimal()

# Distribution of the `NUM_FIDs` variable (cards per client).
data1_numfid <- data1_complete %>%
  group_by(NUM_FIDs) %>%
  summarise(TOT_CLIs = n_distinct(ID_CLI)) %>%
  mutate(PERCENT = TOT_CLIs / sum(TOT_CLIs)) %>%
  arrange(desc(PERCENT))
knitr::kable(data1_numfid)
NUM_FIDs TOT_CLIs PERCENT
1 368833 0.9982705
2 617 0.0016700
3 20 0.0000541
4 2 0.0000054
# Bar chart of clients by number of loyalty cards.
# geom_col() is shorthand for geom_bar(stat = "identity").
ggplot(data1_numfid, aes(x = NUM_FIDs, y = TOT_CLIs)) +
  geom_col(fill = "steelblue") +
  theme_minimal()

# Distribution of the `LAST_TYP_CLI_FID` variable (main account holder flag).
data1_type <- data1_complete %>%
  group_by(LAST_TYP_CLI_FID) %>%
  summarise(TOT_CLIs = n_distinct(ID_CLI)) %>%
  mutate(PERCENT = TOT_CLIs / sum(TOT_CLIs)) %>%
  arrange(desc(PERCENT))
knitr::kable(data1_type)
LAST_TYP_CLI_FID TOT_CLIs PERCENT
1 363865 0.9848243
0 5607 0.0151757
# Bar chart of clients by `LAST_TYP_CLI_FID`.
# geom_col() is shorthand for geom_bar(stat = "identity").
ggplot(data1_type, aes(x = LAST_TYP_CLI_FID, y = TOT_CLIs)) +
  geom_col(fill = "steelblue") +
  theme_minimal()

# Distribution of first-activation dates across clients.
data1_date <- data1_complete %>%
  group_by(FIRST_DT_ACTIVE) %>%
  summarise(TOT_CLIs = n_distinct(ID_CLI)) %>%
  mutate(PERCENT = TOT_CLIs / sum(TOT_CLIs)) %>%
  arrange(desc(PERCENT))
knitr::kable(head(data1_date))
FIRST_DT_ACTIVE TOT_CLIs PERCENT
2018-11-23 3021 0.0081765
2018-04-07 1457 0.0039435
2018-11-22 1438 0.0038920
2018-03-11 1437 0.0038893
2018-04-28 1400 0.0037892
2018-04-14 1399 0.0037865
# Time series of activations: clients per first-activation date.
ggplot(data1_date, aes(x = FIRST_DT_ACTIVE, y = TOT_CLIs)) +
  geom_line() +
  theme_minimal()

Secondo dataset

Pulizia

# Duplicate check on the account table: distinct clients vs total rows
# (the counts match, so there are no duplicates).
data2_dup <- data2 %>%
  summarise(
    TOT_ID_CLIs = n_distinct(ID_CLI),
    TOT_ROWs    = n()
  )

knitr::kable(data2_dup)
TOT_ID_CLIs TOT_ROWs
369472 369472
# Factorise the categorical columns and make missing values explicit factor
# levels: W_PHONE NA -> "0", EMAIL_PROVIDER / TYP_JOB NA -> "(missing)".
# NOTE(review): forcats has deprecated fct_explicit_na() in favour of
# fct_na_value_to_level(); consider migrating when the package version allows.
data2_complete <- data2 %>%
                  mutate(W_PHONE = as.factor(W_PHONE))                  %>% 
                  mutate(TYP_CLI_ACCOUNT = as.factor(TYP_CLI_ACCOUNT))  %>%  
                  mutate(W_PHONE = fct_explicit_na(W_PHONE, "0"))       %>% 
                  mutate(EMAIL_PROVIDER = fct_explicit_na(EMAIL_PROVIDER,
                                                          "(missing)")) %>% 
                  mutate(TYP_JOB = fct_explicit_na(TYP_JOB, "(missing)"))   

Reshaping

# Distribution of clients by e-mail provider.

data2_emailprovider <- data2_complete %>%
  group_by(EMAIL_PROVIDER) %>%
  summarise(TOT_CLIs = n_distinct(ID_CLI)) %>%
  mutate(PERCENT = TOT_CLIs / sum(TOT_CLIs)) %>%
  arrange(desc(PERCENT)) %>%
  as.data.frame()

knitr::kable(head(data2_emailprovider))
EMAIL_PROVIDER TOT_CLIs PERCENT
gmail.com 151508 0.4100663
libero.it 57782 0.1563907
hotmail.it 28698 0.0776730
alice.it 18127 0.0490619
yahoo.it 16538 0.0447612
hotmail.com 10076 0.0272713
# EMAIL_PROVIDER has many levels with near-zero frequency. The plan is to
# lump every level below a cumulative coverage of 0.85 into a single level
# named "others". Preview the cumulative coverage of the top providers.

knitr::kable(data2_emailprovider                      %>%
  arrange(desc(PERCENT))                                   %>% 
  mutate(PERCENT_COVERED = cumsum(TOT_CLIs)/sum(TOT_CLIs)) %>% 
  as.data.frame()                                          %>%
  head(10))
EMAIL_PROVIDER TOT_CLIs PERCENT PERCENT_COVERED
gmail.com 151508 0.4100663 0.4100663
libero.it 57782 0.1563907 0.5664570
hotmail.it 28698 0.0776730 0.6441300
alice.it 18127 0.0490619 0.6931919
yahoo.it 16538 0.0447612 0.7379531
hotmail.com 10076 0.0272713 0.7652244
virgilio.it 9161 0.0247948 0.7900193
tiscali.it 8733 0.0236364 0.8136557
live.it 7936 0.0214793 0.8351350
(missing) 5889 0.0159390 0.8510740
# Lump rare providers: keep a provider as-is while the cumulative client
# coverage is below 85% (plus the single provider that crosses the
# threshold); everything else is mapped to "others". "(missing)" is always
# kept as its own level.
emailprovider <- data2_emailprovider %>%
                          arrange(desc(PERCENT)) %>%                                       
                          mutate(PERCENT_COVERED = cumsum(TOT_CLIs)/sum(TOT_CLIs)) %>%      
                          mutate(EMAIL_PROVIDER = as.character(EMAIL_PROVIDER))    %>%
                          # AUX = 1 for rows below the threshold and for the row that
                          # crosses it. lag() is NA on the first row, but the first
                          # condition is TRUE there, so TRUE | NA still yields TRUE.
                          mutate(AUX = if_else(PERCENT_COVERED < 0.85 |
                                                 (PERCENT_COVERED > 0.85 &
                                                    lag(PERCENT_COVERED) < 0.85), 1,0)) %>%
                          # NOTE(review): AUX is numeric (1/0) used as a logical in `|`
                          # below; R coerces it, but a logical column would be cleaner.
                          mutate(EMAIL_PROVIDER_CLEAN = if_else(AUX | EMAIL_PROVIDER == "(missing)",
                                                                EMAIL_PROVIDER,
                                                                "others"))                  

knitr::kable(head(emailprovider, 7))
EMAIL_PROVIDER TOT_CLIs PERCENT PERCENT_COVERED AUX EMAIL_PROVIDER_CLEAN
gmail.com 151508 0.4100663 0.4100663 1 gmail.com
libero.it 57782 0.1563907 0.5664570 1 libero.it
hotmail.it 28698 0.0776730 0.6441300 1 hotmail.it
alice.it 18127 0.0490619 0.6931919 1 alice.it
yahoo.it 16538 0.0447612 0.7379531 1 yahoo.it
hotmail.com 10076 0.0272713 0.7652244 1 hotmail.com
virgilio.it 9161 0.0247948 0.7900193 1 virgilio.it
# Replace EMAIL_PROVIDER with its cleaned version (rare providers lumped
# into "others") via a lookup join, then factorise the result.
data2_complete <- data2_complete %>%
  mutate(EMAIL_PROVIDER = as.character(EMAIL_PROVIDER)) %>%
  left_join(
    emailprovider %>% select(EMAIL_PROVIDER, EMAIL_PROVIDER_CLEAN),
    by = "EMAIL_PROVIDER"
  ) %>%
  select(-EMAIL_PROVIDER) %>%
  mutate(EMAIL_PROVIDER_CLEAN = as.factor(EMAIL_PROVIDER_CLEAN))

Analisi esplorative

# Distribution of the `W_PHONE` variable (phone number on record).
data2_phone <- data2_complete %>%
  group_by(W_PHONE) %>%
  summarise(TOT_CLIs = n_distinct(ID_CLI)) %>%
  mutate(PERCENT = TOT_CLIs / sum(TOT_CLIs)) %>%
  arrange(desc(PERCENT)) %>%
  as.data.frame()

knitr::kable(data2_phone)
W_PHONE TOT_CLIs PERCENT
1 342167 0.9260972
0 27305 0.0739028
# Bar chart of clients by phone availability.
# geom_col() is shorthand for geom_bar(stat = "identity").
ggplot(data2_phone, aes(x = W_PHONE, y = TOT_CLIs)) +
  geom_col(fill = "steelblue") +
  theme_minimal()

# Distribution of the `TYP_CLI_ACCOUNT` variable.
data2_account <- data2_complete %>%
  group_by(TYP_CLI_ACCOUNT) %>%
  summarise(TOT_CLIs = n_distinct(ID_CLI)) %>%
  mutate(PERCENT = TOT_CLIs / sum(TOT_CLIs)) %>%
  arrange(desc(PERCENT)) %>%
  as.data.frame()

knitr::kable(data2_account)
TYP_CLI_ACCOUNT TOT_CLIs PERCENT
4 333656 0.9030617
2 35816 0.0969383
# Distribution of the cleaned e-mail provider variable, with a bar chart.
data2_emailproviderclean <- data2_complete %>%
  group_by(EMAIL_PROVIDER_CLEAN) %>%
  summarise(TOT_CLIs = n_distinct(ID_CLI)) %>%
  mutate(PERCENT = TOT_CLIs / sum(TOT_CLIs)) %>%
  arrange(desc(PERCENT))

# geom_col() is shorthand for geom_bar(stat = "identity").
ggplot(data2_emailproviderclean, aes(x = EMAIL_PROVIDER_CLEAN, y = TOT_CLIs)) +
  geom_col(fill = "steelblue") +
  theme_minimal()

# Distribution of the `TYP_JOB` variable (mostly "(missing)").
data2_job <- data2_complete %>%
  group_by(TYP_JOB) %>%
  summarise(TOT_CLIs = n_distinct(ID_CLI)) %>%
  mutate(PERCENT = TOT_CLIs / sum(TOT_CLIs)) %>%
  arrange(desc(PERCENT)) %>%
  as.data.frame()

knitr::kable(data2_job)
TYP_JOB TOT_CLIs PERCENT
(missing) 360810 0.9765557
Libero professionista 3970 0.0107451
Impiegato/a 1560 0.0042222
Altro 784 0.0021219
Pensionato/a 641 0.0017349
Operaio/a 482 0.0013046
Dirigente/Quadro/Funzionario 354 0.0009581
Non Dichiara 218 0.0005900
Casalinga 191 0.0005170
Artigiano 131 0.0003546
Imprenditore 108 0.0002923
In cerca di occupazione 79 0.0002138
Commerciante/Esercente 71 0.0001922
Studente 54 0.0001462
Rappresentante/Agente Commerciale 19 0.0000514
# Bar chart of clients by declared job.
# geom_col() is shorthand for geom_bar(stat = "identity").
ggplot(data2_job, aes(x = TYP_JOB, y = TOT_CLIs)) +
  geom_col(fill = "steelblue") +
  theme_minimal()

# NOTE(review): this block recomputes `data2_account` and is an exact
# duplicate of the TYP_CLI_ACCOUNT analysis performed earlier in this
# section; it is harmless (same result) but could be removed.
data2_account <- data2_complete         %>%
  group_by(TYP_CLI_ACCOUNT)                %>% 
  summarise(TOT_CLIs = n_distinct(ID_CLI)) %>% 
  mutate(PERCENT = TOT_CLIs/sum(TOT_CLIs)) %>% 
  arrange(desc(PERCENT))                   %>% 
  as.data.frame()

knitr::kable(data2_account)
TYP_CLI_ACCOUNT TOT_CLIs PERCENT
4 333656 0.9030617
2 35816 0.0969383

Terzo dataset

Pulizia

# Duplicate check on the address table: far fewer distinct ID_ADDRESS values
# than rows, so duplicated rows are present and must be removed.
knitr::kable(
  data3 %>%
    summarise(
      TOT_ID_ADDRESSes = n_distinct(ID_ADDRESS),
      TOT_ROWs         = n()
    )
)
TOT_ID_ADDRESSes TOT_ROWs
361330 1211332
# Drop exact duplicate rows.
data3_complete <- data3 %>%
  distinct()
# Keep CAP as a character code (postal codes are not numbers) and drop rows
# with a missing value in any of CAP, PRV or REGION.
data3_complete <- data3_complete %>%
  mutate(CAP = as.character(CAP)) %>%
  filter(!is.na(CAP), !is.na(PRV), !is.na(REGION))

Analisi esplorative

# CAP has 4784 distinct values — too many to analyse directly.
nrow(distinct(data3_complete, CAP))
## [1] 4784
# Given the high cardinality of CAP, fall back to PRV — which still has
# 110 distinct values.
nrow(distinct(data3_complete, PRV))
## [1] 110
# PRV is still too granular; REGION has only 20 distinct values, so the
# geographic analysis focuses on it.
nrow(distinct(data3_complete, REGION))
## [1] 20
# Distribution of distinct addresses by region.
data3_region <- data3_complete %>%
  group_by(REGION) %>%
  summarise(TOT_ADDRESS = n_distinct(ID_ADDRESS)) %>%
  mutate(PERCENT = TOT_ADDRESS / sum(TOT_ADDRESS)) %>%
  arrange(desc(PERCENT))

knitr::kable(head(data3_region))
REGION TOT_ADDRESS PERCENT
LOMBARDIA 97181 0.2879710
LAZIO 32058 0.0949957
CAMPANIA 30570 0.0905864
VENETO 29696 0.0879965
SICILIA 28329 0.0839457
PIEMONTE 24377 0.0722350
# Bar chart of addresses per region.
# geom_col() is shorthand for geom_bar(stat = "identity").
ggplot(data3_region, aes(x = REGION, y = TOT_ADDRESS)) +
  geom_col(fill = "steelblue") +
  theme_minimal()

Quarto dataset

Pulizia

# Duplicate check on the privacy table: one row per client, no duplicates.

knitr::kable(
  data4 %>%
    summarise(
      TOT_ID_CLIs = n_distinct(ID_CLI),
      TOT_ROWs    = n()
    )
)
TOT_ID_CLIs TOT_ROWs
369472 369472
# Turn the three binary privacy flags into factors.
data4_complete <- data4 %>%
  mutate(
    FLAG_PRIVACY_1  = as.factor(FLAG_PRIVACY_1),
    FLAG_PRIVACY_2  = as.factor(FLAG_PRIVACY_2),
    FLAG_DIRECT_MKT = as.factor(FLAG_DIRECT_MKT)
  )

Analisi esplorative

# Distribution of the `FLAG_PRIVACY_1` variable.
data4_privacy1 <- data4_complete %>%
  group_by(FLAG_PRIVACY_1) %>%
  summarise(TOT_ID = n_distinct(ID_CLI)) %>%
  mutate(PERCENT = TOT_ID / sum(TOT_ID)) %>%
  arrange(desc(PERCENT))

knitr::kable(data4_privacy1)
FLAG_PRIVACY_1 TOT_ID PERCENT
1 242251 0.6556681
0 127221 0.3443319
# Bar chart of clients by FLAG_PRIVACY_1.
# geom_col() is shorthand for geom_bar(stat = "identity").
ggplot(data4_privacy1, aes(x = FLAG_PRIVACY_1, y = TOT_ID)) +
  geom_col(fill = "steelblue") +
  theme_minimal()

# Distribution of the `FLAG_PRIVACY_2` variable.

data4_privacy2 <- data4_complete %>%
  group_by(FLAG_PRIVACY_2) %>%
  summarise(TOT_ID = n_distinct(ID_CLI)) %>%
  mutate(PERCENT = TOT_ID / sum(TOT_ID)) %>%
  arrange(desc(PERCENT))

knitr::kable(data4_privacy2)
FLAG_PRIVACY_2 TOT_ID PERCENT
1 345682 0.9356108
0 23790 0.0643892
# Bar chart of clients by FLAG_PRIVACY_2.
# geom_col() is shorthand for geom_bar(stat = "identity").
ggplot(data4_privacy2, aes(x = FLAG_PRIVACY_2, y = TOT_ID)) +
  geom_col(fill = "steelblue") +
  theme_minimal()

# Distribution of the `FLAG_DIRECT_MKT` variable (direct-marketing consent).
data4_privacy_mkt <- data4_complete %>%
  group_by(FLAG_DIRECT_MKT) %>%
  summarise(TOT_ID = n_distinct(ID_CLI)) %>%
  mutate(PERCENT = TOT_ID / sum(TOT_ID)) %>%
  arrange(desc(PERCENT))

knitr::kable(data4_privacy_mkt)
FLAG_DIRECT_MKT TOT_ID PERCENT
1 247790 0.6706598
0 121682 0.3293402
# Bar chart of clients by FLAG_DIRECT_MKT.
# geom_col() is shorthand for geom_bar(stat = "identity").
ggplot(data4_privacy_mkt, aes(x = FLAG_DIRECT_MKT, y = TOT_ID)) +
  geom_col(fill = "steelblue") +
  theme_minimal()

Quinto dataset

# Drop CHANNEL_CAMP, considered not discriminating for the analysis.
data5_complete <- data5 %>%
  select(-CHANNEL_CAMP)

Sesto dataset

Pulizia

# Parse the event timestamp and derive the hour of day and the calendar date.
# NOTE(review): the format "%Y-%m-%dT%H:%M:%S" does not match the
# space-separated timestamps shown in head(raw6) above; this likely only
# works because read_csv2 already parsed EVENT_DATE as POSIXct (in which
# case as.POSIXct ignores `format`). Confirm, and use "%Y-%m-%d %H:%M:%S"
# if the column ever arrives as character.
data6_complete <- data6                                                    %>%
                              mutate(EVENT_DATETIME = as.POSIXct(EVENT_DATE,
                                                                 format = "%Y-%m-%dT%H:%M:%S")) %>% 
                              mutate(EVENT_HOUR = hour(EVENT_DATETIME))                         %>% 
                              mutate(EVENT_DATE = as.Date(EVENT_DATETIME))                    

Reshaping

# Factorise TYP_EVENT, merging the "E" and "B" levels into "F" (failures),
# then attach the campaign type by joining the campaign catalogue on ID_CAMP.
data6_complete<- data6_complete %>%
                          mutate(TYP_EVENT = as.factor(if_else(TYP_EVENT == "E" | TYP_EVENT == "B",
                                                               "F", as.character(TYP_EVENT)))) %>%
                          left_join(data5_complete,
                                    by = "ID_CAMP")                                                 
# Keep only the send events ("S"), renaming columns so that opens, clicks
# and failures can later be attached to each send.
d6_sends <- data6_complete %>%
  filter(TYP_EVENT == "S") %>%
  select(-TYP_EVENT) %>%
  select(
    ID_EVENT_S = ID_EVENT,
    ID_CLI,
    ID_CAMP,
    TYP_CAMP,
    ID_DELIVERY,
    SEND_DATE = EVENT_DATE
  ) %>%
  as.data.frame()
# Open events ("V"), kept raw here; deduplication to the first open per
# client/campaign/delivery happens downstream.
# FIX: removed the redundant `select(-TYP_EVENT)` — the renaming `select()`
# below already drops every column it does not list.
d6_opens_parz <- data6_complete %>%
  filter(TYP_EVENT == "V") %>%
  select(ID_EVENT_O = ID_EVENT,
         ID_CLI,
         ID_CAMP,
         TYP_CAMP,
         ID_DELIVERY,
         OPEN_DATETIME = EVENT_DATETIME,
         OPEN_DATE = EVENT_DATE) %>%
  as.data.frame()

# Count distinct open events per client / campaign / delivery
d6_total_opens <- summarise(
  group_by(d6_opens_parz, ID_CLI, ID_CAMP, ID_DELIVERY),
  NUM_OPENs = n_distinct(ID_EVENT_O)
)
                
# Keep only the earliest open per client/campaign/delivery (one row even in
# case of timestamp ties), annotated with the total number of distinct opens
d6_opens <- d6_opens_parz %>%
  left_join(d6_total_opens,
            by = c("ID_CLI", "ID_CAMP", "ID_DELIVERY")) %>%
  group_by(ID_CLI, ID_CAMP, ID_DELIVERY) %>%
  filter(OPEN_DATETIME == min(OPEN_DATETIME)) %>%
  filter(row_number() == 1) %>%   # break ties on identical timestamps
  ungroup() %>%
  as.data.frame()
# Click events ("C"), kept raw here; deduplication to the first click per
# client/campaign/delivery happens downstream.
# FIX: removed the redundant `select(-TYP_EVENT)` — the renaming `select()`
# below already drops every column it does not list.
d6_clicks_parz <- data6_complete %>%
  filter(TYP_EVENT == "C") %>%
  select(ID_EVENT_C = ID_EVENT,
         ID_CLI,
         ID_CAMP,
         TYP_CAMP,
         ID_DELIVERY,
         CLICK_DATETIME = EVENT_DATETIME,
         CLICK_DATE = EVENT_DATE)
                    
# Count distinct click events per client / campaign / delivery
d6_total_clicks <- summarise(
  group_by(d6_clicks_parz, ID_CLI, ID_CAMP, ID_DELIVERY),
  NUM_CLICKs = n_distinct(ID_EVENT_C)
)

# Keep only the earliest click per client/campaign/delivery (one row even in
# case of timestamp ties), annotated with the total number of distinct clicks
d6_clicks <- d6_clicks_parz %>%
  left_join(d6_total_clicks,
            by = c("ID_CLI", "ID_CAMP", "ID_DELIVERY")) %>%
  group_by(ID_CLI, ID_CAMP, ID_DELIVERY) %>%
  filter(CLICK_DATETIME == min(CLICK_DATETIME)) %>%
  filter(row_number() == 1) %>%   # break ties on identical timestamps
  ungroup() %>%
  as.data.frame()
# Failure events ("F", including collapsed "E"/"B"): keep the earliest failure
# per client/campaign/delivery.
# FIX: removed the redundant `select(-TYP_EVENT)` — the renaming `select()`
# below already drops every column it does not list.
d6_fails <- data6_complete %>%
  filter(TYP_EVENT == "F") %>%
  select(ID_EVENT_F = ID_EVENT,
         ID_CLI,
         ID_CAMP,
         TYP_CAMP,
         ID_DELIVERY,
         FAIL_DATETIME = EVENT_DATETIME,
         FAIL_DATE = EVENT_DATE) %>%
  group_by(ID_CLI, ID_CAMP, ID_DELIVERY) %>%
  filter(FAIL_DATETIME == min(FAIL_DATETIME)) %>%
  filter(row_number() == 1) %>%   # break ties on identical timestamps
  ungroup() %>%
  as.data.frame()
# Combine every send with its matching open, click and failure (if any) into a
# single per-send table. d6_opens / d6_clicks / d6_fails were deduplicated to
# one row per ID_CLI x ID_CAMP x ID_DELIVERY upstream, so these joins do not
# fan out. The interleaved filters enforce temporal consistency.
# NOTE(review): `is.na(CLICK_DATE) | OPEN_DATE <= CLICK_DATE` evaluates to NA
# (row dropped) when a click exists but no open does — confirm that discarding
# clicked-but-never-opened sends is intended.
data6_complete_finale <- d6_sends                     %>%
                                left_join(d6_opens,
                                          by = c("ID_CLI",
                                                 "ID_CAMP",
                                                 "ID_DELIVERY",
                                                 "TYP_CAMP"))                            %>% 
                                # keep sends that were never opened, or opened on/after the send date
                                filter(is.na(OPEN_DATE) | SEND_DATE <= OPEN_DATE)        %>%
                                left_join(d6_clicks,
                                          by = c("ID_CLI",
                                                 "ID_CAMP",
                                                 "ID_DELIVERY",
                                                 "TYP_CAMP"))                            %>% 
                                # keep sends that were never clicked, or clicked on/after the open date
                                filter(is.na(CLICK_DATE) | OPEN_DATE <= CLICK_DATE)      %>%
                                left_join(d6_fails,
                                          by = c("ID_CLI",
                                                 "ID_CAMP",
                                                 "ID_DELIVERY",
                                                 "TYP_CAMP"))                            %>% 
                                # keep sends that never failed, or failed on/after the send date
                                filter(is.na(FAIL_DATE) | SEND_DATE <= FAIL_DATE)        %>% 
                                # boolean flags: a matching event id means the event occurred
                                mutate(OPENED = !is.na(ID_EVENT_O))                      %>% 
                                mutate(CLICKED = !is.na(ID_EVENT_C))                     %>% 
                                mutate(FAILED = !is.na(ID_EVENT_F))                      %>% 
                                # latency between send and first open (NA when never opened)
                                mutate(DAYS_TO_OPEN = as.integer(OPEN_DATE - SEND_DATE)) %>%
                                select(ID_EVENT_S,
                                       ID_CLI,
                                       ID_CAMP,
                                       TYP_CAMP,
                                       ID_DELIVERY,
                                       SEND_DATE,
                                       OPENED,
                                       OPEN_DATE,
                                       DAYS_TO_OPEN,
                                       NUM_OPENs,
                                       CLICKED,
                                       CLICK_DATE,
                                       NUM_CLICKs,
                                       FAILED)                                                

Analisi esplorative

# Dataset-wide overview: send-date span, distinct sends, distinct clients
d6_overview <- data6_complete_finale %>%
  summarise(MIN_DATE   = min(SEND_DATE),
            MAX_DATE   = max(SEND_DATE),
            TOT_EVENTs = n_distinct(ID_EVENT_S),
            TOT_CLIs   = n_distinct(ID_CLI))

knitr::kable(d6_overview)
MIN_DATE MAX_DATE TOT_EVENTs TOT_CLIs
2019-01-03 2019-04-30 1556646 190427
# Same overview statistics, broken down by campaign type
data6_typ_camp <- data6_complete_finale %>%
  group_by(TYP_CAMP) %>%
  summarise(MIN_DATE   = min(SEND_DATE),
            MAX_DATE   = max(SEND_DATE),
            TOT_EVENTs = n_distinct(ID_EVENT_S),
            TOT_CLIs   = n_distinct(ID_CLI))

knitr::kable(data6_typ_camp)
TYP_CAMP MIN_DATE MAX_DATE TOT_EVENTs TOT_CLIs
LOCAL 2019-02-02 2019-04-02 151719 87894
NATIONAL 2019-01-07 2019-04-23 833085 177153
PERSONALIZED 2019-01-03 2019-04-30 194840 133908
PRODUCT 2019-01-03 2019-04-25 377002 69724
# Send volumes per campaign type
ggplot(data6_typ_camp,
       aes(x = TYP_CAMP, y = TOT_EVENTs)) +
  geom_col(fill = "steelblue") +
  theme_minimal()

# Overall share of sends (and clients) with at least one open;
# a TYP_CAMP = 'ALL' label is attached for plotting alongside per-type data
data6_opened <- data6_complete_finale %>%
  group_by(OPENED) %>%
  summarise(TOT_EVENTs = n_distinct(ID_EVENT_S),
            TOT_CLIs   = n_distinct(ID_CLI)) %>%
  mutate(TYP_CAMP       = 'ALL',
         PERCENT_EVENTs = TOT_EVENTs / d6_overview$TOT_EVENTs,
         PERCENT_CLIs   = TOT_CLIs / d6_overview$TOT_CLIs)

knitr::kable(data6_opened)
OPENED TOT_EVENTs TOT_CLIs TYP_CAMP PERCENT_EVENTs PERCENT_CLIs
FALSE 1278264 178378 ALL 0.8211655 0.9367264
TRUE 278382 83420 ALL 0.1788345 0.4380681
# Proportional (filled) bar of opened vs unopened sends
ggplot(data6_opened,
       aes(x = TYP_CAMP, y = TOT_EVENTs, fill = OPENED)) +
  geom_col(position = "fill") +
  theme_minimal()

# Opened vs unopened sends by campaign type, with within-type percentages
# (counts joined against the per-type totals from data6_typ_camp)
data6_openedbytyp <- data6_complete_finale %>%
  group_by(TYP_CAMP, OPENED) %>%
  summarise(TOT_EVENTs = n_distinct(ID_EVENT_S),
            TOT_CLIs   = n_distinct(ID_CLI)) %>%
  left_join(data6_typ_camp %>%
              select(TYP_CAMP,
                     ALL_TOT_EVENTs = TOT_EVENTs,
                     ALL_TOT_CLIs = TOT_CLIs),
            by = 'TYP_CAMP') %>%
  mutate(PERCENT_EVENTs = TOT_EVENTs / ALL_TOT_EVENTs,
         PERCENT_CLIs   = TOT_CLIs / ALL_TOT_CLIs) %>%
  select(TYP_CAMP, OPENED, TOT_EVENTs, TOT_CLIs,
         PERCENT_EVENTs, PERCENT_CLIs)

knitr::kable(data6_openedbytyp)
TYP_CAMP OPENED TOT_EVENTs TOT_CLIs PERCENT_EVENTs PERCENT_CLIs
LOCAL FALSE 126700 76835 0.8350965 0.8741780
LOCAL TRUE 25019 18029 0.1649035 0.2051221
NATIONAL FALSE 710721 162049 0.8531194 0.9147404
NATIONAL TRUE 122364 62964 0.1468806 0.3554216
PERSONALIZED FALSE 156431 111942 0.8028690 0.8359620
PERSONALIZED TRUE 38409 31327 0.1971310 0.2339442
PRODUCT FALSE 284412 62356 0.7544045 0.8943262
PRODUCT TRUE 92590 29605 0.2455955 0.4246027
# Opened vs unopened sends per campaign type: absolute counts...
ggplot(data6_openedbytyp,
       aes(x = TYP_CAMP, y = TOT_EVENTs, fill = OPENED)) +
  geom_col() +
  theme_minimal() +
  labs(title="APERTURA MAIL PER TIPOLOGIA DI CAMPAGNA")

# ...and proportions within each type
ggplot(data6_openedbytyp,
       aes(x = TYP_CAMP, y = TOT_EVENTs, fill = OPENED)) +
  geom_col(position = "fill") +
  theme_minimal()

# Per-client average days-to-open (floored), then how many clients share each
# average value
data6_daystoopen <- data6_complete_finale %>%
  filter(OPENED) %>%
  group_by(ID_CLI) %>%
  summarise(AVG_DAYS_TO_OPEN = floor(mean(DAYS_TO_OPEN))) %>%
  ungroup() %>%
  group_by(AVG_DAYS_TO_OPEN) %>%
  summarise(TOT_CLIs = n_distinct(ID_CLI))

knitr::kable(head(data6_daystoopen))
AVG_DAYS_TO_OPEN TOT_CLIs
0 53094
1 12383
2 4906
3 3066
4 2147
5 1459
# Distribution of average days-to-open, truncated below two weeks
ggplot(data6_daystoopen %>% filter(AVG_DAYS_TO_OPEN < 14),
       aes(x = AVG_DAYS_TO_OPEN, y = TOT_CLIs)) +
  geom_col(fill = "steelblue") +
  theme_minimal()

# Cumulative share of clients covered as average days-to-open grows
data6_daystoopen_vs_cumulate <- data6_daystoopen %>%
  arrange(AVG_DAYS_TO_OPEN) %>%
  mutate(PERCENT_COVERED = cumsum(TOT_CLIs) / sum(TOT_CLIs))

ggplot(data6_daystoopen_vs_cumulate %>%
         filter(AVG_DAYS_TO_OPEN < 14),
       aes(x = AVG_DAYS_TO_OPEN, y = PERCENT_COVERED)) +
  geom_line() +
  geom_point() +
  scale_x_continuous(breaks = seq(0, 14, 2), minor_breaks = 0:14) +
  theme_minimal() +
  labs(title="DISTRIBUZIONE GIORNI PRIMA DELL'APERTURA DELLA MAIL")

# Clicked vs unclicked sends by campaign type, with within-type percentages
data6_clickedbytyp <- data6_complete_finale %>%
  group_by(TYP_CAMP, CLICKED) %>%
  summarise(TOT_EVENTs = n_distinct(ID_EVENT_S),
            TOT_CLIs   = n_distinct(ID_CLI)) %>%
  left_join(data6_typ_camp %>%
              select(TYP_CAMP,
                     ALL_TOT_EVENTs = TOT_EVENTs,
                     ALL_TOT_CLIs = TOT_CLIs),
            by='TYP_CAMP') %>%
  mutate(PERCENT_EVENTs = TOT_EVENTs / ALL_TOT_EVENTs,
         PERCENT_CLIs   = TOT_CLIs / ALL_TOT_CLIs) %>%
  select(TYP_CAMP, CLICKED, TOT_EVENTs, TOT_CLIs,
         PERCENT_EVENTs, PERCENT_CLIs)

knitr::kable(data6_clickedbytyp)
TYP_CAMP CLICKED TOT_EVENTs TOT_CLIs PERCENT_EVENTs PERCENT_CLIs
LOCAL FALSE 150374 87332 0.9911349 0.9936059
LOCAL TRUE 1345 1280 0.0088651 0.0145630
NATIONAL FALSE 815796 175939 0.9792470 0.9931472
NATIONAL TRUE 17289 14216 0.0207530 0.0802470
PERSONALIZED FALSE 192741 133043 0.9892271 0.9935403
PERSONALIZED TRUE 2099 2016 0.0107729 0.0150551
PRODUCT FALSE 362551 69373 0.9616686 0.9949659
PRODUCT TRUE 14451 8800 0.0383314 0.1262119
# Clicked vs unclicked sends per campaign type: absolute counts...
ggplot(data6_clickedbytyp,
       aes(x = TYP_CAMP, y = TOT_EVENTs, fill = CLICKED)) +
  geom_col() +
  theme_minimal()

# ...and proportions within each type
ggplot(data6_clickedbytyp,
       aes(x = TYP_CAMP, y = TOT_EVENTs, fill = CLICKED)) +
  geom_col(position = "fill") +
  theme_minimal()

# Failed vs delivered sends by campaign type, with within-type percentages
data6_failedbytyp <- data6_complete_finale %>%
  group_by(TYP_CAMP, FAILED) %>%
  summarise(TOT_EVENTs = n_distinct(ID_EVENT_S),
            TOT_CLIs   = n_distinct(ID_CLI)) %>%
  left_join(data6_typ_camp %>%
              select(TYP_CAMP,
                     ALL_TOT_EVENTs = TOT_EVENTs,
                     ALL_TOT_CLIs = TOT_CLIs),
            by = 'TYP_CAMP') %>%
  mutate(PERCENT_EVENTs = TOT_EVENTs / ALL_TOT_EVENTs,
         PERCENT_CLIs   = TOT_CLIs / ALL_TOT_CLIs) %>%
  select(TYP_CAMP, FAILED, TOT_EVENTs, TOT_CLIs,
         PERCENT_EVENTs, PERCENT_CLIs)

knitr::kable(data6_failedbytyp)
TYP_CAMP FAILED TOT_EVENTs TOT_CLIs PERCENT_EVENTs PERCENT_CLIs
LOCAL FALSE 149189 86540 0.9833244 0.9845951
LOCAL TRUE 2530 1653 0.0166756 0.0188067
NATIONAL FALSE 814897 174425 0.9781679 0.9846009
NATIONAL TRUE 18188 7045 0.0218321 0.0397679
PERSONALIZED FALSE 192844 132774 0.9897557 0.9915315
PERSONALIZED TRUE 1996 1690 0.0102443 0.0126206
PRODUCT FALSE 371788 68960 0.9861698 0.9890425
PRODUCT TRUE 5214 1729 0.0138302 0.0247978
# Failed vs delivered sends per campaign type: absolute counts...
ggplot(data6_failedbytyp,
       aes(x = TYP_CAMP, y = TOT_EVENTs, fill = FAILED)) +
  geom_col() +
  theme_minimal()

# ...and proportions within each type
ggplot(data6_failedbytyp,
       aes(x = TYP_CAMP, y = TOT_EVENTs, fill = FAILED)) +
  geom_col(position = "fill") +
  theme_minimal()

# Distribution of the number of opens per send (NA = send never opened)
data6_numopens <- data6_complete_finale %>%
  group_by(NUM_OPENs) %>%
  summarise(TOT_ID = n_distinct(ID_EVENT_S)) %>%
  mutate(PERCENT = TOT_ID / sum(TOT_ID)) %>%
  arrange(desc(PERCENT))

knitr::kable(head(data6_numopens))
NUM_OPENs TOT_ID PERCENT
NA 1278264 0.8211655
1 220900 0.1419077
2 37877 0.0243324
3 11058 0.0071037
4 4002 0.0025709
5 1833 0.0011775
# Opens-per-send distribution, x-axis limited to [0, 15]
ggplot(data6_numopens,
       aes(x = NUM_OPENs, y = TOT_ID)) +
  geom_col(fill = "steelblue") +
  xlim(0, 15) +
  theme_minimal()

# Distribution of the number of clicks per send (NA = send never clicked)
data6_numclicks <- data6_complete_finale %>%
  group_by(NUM_CLICKs) %>%
  summarise(TOT_ID = n_distinct(ID_EVENT_S)) %>%
  mutate(PERCENT = TOT_ID / sum(TOT_ID)) %>%
  arrange(desc(PERCENT))

knitr::kable(head(data6_numclicks))
NUM_CLICKs TOT_ID PERCENT
NA 1521462 0.9773976
1 26912 0.0172885
2 5586 0.0035885
3 1604 0.0010304
4 593 0.0003809
5 263 0.0001690
# Clicks-per-send distribution, x-axis limited to [0, 15]
ggplot(data6_numclicks,
       aes(x = NUM_CLICKs, y = TOT_ID)) +
  geom_col(fill = "steelblue") +
  xlim(0, 15) +
  theme_minimal()

Settimo dataset

Pulizia

# Parse the ticket timestamp, derive hour/date, and factorise the categorical
# columns DIREZIONE and COD_REPARTO.
# NOTE(review): the format string "%Y-%m-%dT%H%M%S" has no colons between the
# time fields, unlike the "%H:%M:%S" used for dataset 6 — presumably raw_7
# stores times without separators (the rendered sample parses correctly), but
# confirm against the raw file.
data7_complete <- data7 %>%
                    mutate(TIC_DATETIME = as.POSIXct(DATETIME,
                                                     format = "%Y-%m-%dT%H%M%S")) %>% 
                    mutate(TIC_HOUR = hour(TIC_DATETIME))                         %>% 
                    mutate(TIC_DATE = as.Date(TIC_DATETIME))                      %>% 
                    select(-DATETIME)                                             %>% 
                    mutate(DIREZIONE = as.factor(DIREZIONE))                      %>% 
                    mutate(COD_REPARTO = as.factor(COD_REPARTO))                      

Reshaping

# Day-type features for each ticket date.
# FIX: `wday()` now uses an explicit `week_start = 1` (Monday) so that values
# 6/7 are Saturday/Sunday. With lubridate's default (Sunday = 1) the
# `c(6, 7)` test labelled Friday/Saturday as the weekend.
# NOTE(review): case_when() keeps the first matching branch, so a holiday
# falling on a weekend is still labelled "weekend" — confirm this is intended.
data7_complete <- data7_complete %>%
                    mutate(TIC_DATE_WEEKDAY = wday(TIC_DATE, week_start = 1))  %>% 
                    mutate(TIC_DATE_HOLIDAY = isHoliday("Italy", TIC_DATE)) %>% 
                    mutate(TIC_DATE_TYP = case_when(
                      (TIC_DATE_WEEKDAY %in% c(6,7)) ~ "weekend",
                      (TIC_DATE_HOLIDAY == TRUE) ~ "holiday",
                      (TIC_DATE_WEEKDAY < 7) ~ "weekday",
                      TRUE ~ "other"))

Analisi esplorative

# Overview of the transactions dataset: date span, tickets, clients
d7_panoramica <- data7_complete %>%
  summarise(MIN_DATE = min(TIC_DATE),
            MAX_DATE = max(TIC_DATE),
            TOT_TICs = n_distinct(ID_SCONTRINO),
            TOT_CLIs = n_distinct(ID_CLI))

knitr::kable(d7_panoramica)
MIN_DATE MAX_DATE TOT_TICs TOT_CLIs
2018-05-01 2019-04-30 998035 212124
# Ticket and client counts by direction, with shares of the overall totals
# (about 91% of tickets are purchases, DIREZIONE == 1)
data7_direction <- data7_complete %>%
  group_by(DIREZIONE) %>%
  summarise(TOT_TICs = n_distinct(ID_SCONTRINO),
            TOT_CLIs = n_distinct(ID_CLI)) %>%
  mutate(PERCENT_TICs = TOT_TICs / d7_panoramica$TOT_TICs,
         PERCENT_CLIs = TOT_CLIs / d7_panoramica$TOT_CLIs)

knitr::kable(data7_direction)
DIREZIONE TOT_TICs TOT_CLIs PERCENT_TICs PERCENT_CLIs
-1 90189 46622 0.0903666 0.2197865
1 907846 212124 0.9096334 1.0000000
# Tickets by hour of day and direction, with shares of each direction's totals
data7_hour <- data7_complete %>%
  group_by(TIC_HOUR, DIREZIONE) %>%
  summarise(TOT_TICs = n_distinct(ID_SCONTRINO),
            TOT_CLIs = n_distinct(ID_CLI)) %>%
  left_join(data7_direction %>%
              select(DIREZIONE,
                     ALL_TOT_TICs = TOT_TICs,
                     ALL_TOT_CLIs = TOT_CLIs),
            by = 'DIREZIONE') %>%
  mutate(PERCENT_TICs = TOT_TICs / ALL_TOT_TICs,
         PERCENT_CLIs = TOT_CLIs / ALL_TOT_CLIs) %>%
  select(-ALL_TOT_TICs, -ALL_TOT_CLIs)

knitr::kable(head(data7_hour))
TIC_HOUR DIREZIONE TOT_TICs TOT_CLIs PERCENT_TICs PERCENT_CLIs
3 1 2 2 0.0000022 0.0000094
4 1 4 4 0.0000044 0.0000189
5 1 2 2 0.0000022 0.0000094
6 1 32 32 0.0000352 0.0001509
7 -1 759 638 0.0084157 0.0136845
7 1 9249 6309 0.0101879 0.0297420
# Tickets per hour stacked by direction: absolute counts...
ggplot(data7_hour,
       aes(x = TIC_HOUR, y = TOT_TICs, fill = DIREZIONE)) +
  geom_col() +
  theme_minimal() +
  labs(title="DISTRIBUZIONE ACQUISTI PER ORA")

# ...and proportions within each hour
ggplot(data7_hour,
       aes(x = TIC_HOUR, y = TOT_TICs, fill = DIREZIONE)) +
  geom_col(position = "fill") +
  theme_minimal()

# Tickets by department and direction, with shares of each direction's totals
data7_rep <- data7_complete %>%
  group_by(COD_REPARTO, DIREZIONE) %>%
  summarise(TOT_TICs = n_distinct(ID_SCONTRINO),
            TOT_CLIs = n_distinct(ID_CLI)) %>%
  left_join(data7_direction %>%
              select(DIREZIONE,
                     ALL_TOT_TICs = TOT_TICs,
                     ALL_TOT_CLIs = TOT_CLIs),
            by = 'DIREZIONE') %>%
  mutate(PERCENT_TICs = TOT_TICs / ALL_TOT_TICs,
         PERCENT_CLIs = TOT_CLIs / ALL_TOT_CLIs) %>%
  select(-ALL_TOT_TICs, -ALL_TOT_CLIs)

knitr::kable(head(data7_rep))
COD_REPARTO DIREZIONE TOT_TICs TOT_CLIs PERCENT_TICs PERCENT_CLIs
1 -1 2512 1928 0.0278526 0.0413539
1 1 65151 30531 0.0717644 0.1439300
2 -1 5604 4598 0.0621362 0.0986230
2 1 94618 54055 0.1042225 0.2548274
3 -1 17083 10835 0.1894133 0.2324010
3 1 229972 93224 0.2533161 0.4394788
# Tickets per department stacked by direction: absolute counts...
ggplot(data7_rep,
       aes(x = COD_REPARTO, y = TOT_TICs, fill = DIREZIONE)) +
  geom_col() +
  theme_minimal()

# ...and proportions within each department
ggplot(data7_rep,
       aes(x = COD_REPARTO, y = TOT_TICs, fill = DIREZIONE)) +
  geom_col(position = "fill") +
  theme_minimal()

# Tickets by day type (weekday/weekend/holiday) and direction, with shares of
# each direction's totals
data7_datetyp <- data7_complete %>%
  group_by(TIC_DATE_TYP, DIREZIONE) %>%
  summarise(TOT_TICs = n_distinct(ID_SCONTRINO),
            TOT_CLIs = n_distinct(ID_CLI)) %>%
  left_join(data7_direction %>%
              select(DIREZIONE,
                     ALL_TOT_TICs = TOT_TICs,
                     ALL_TOT_CLIs = TOT_CLIs),
            by = 'DIREZIONE') %>%
  mutate(PERCENT_TICs = TOT_TICs / ALL_TOT_TICs,
         PERCENT_CLIs = TOT_CLIs / ALL_TOT_CLIs) %>%
  select(-ALL_TOT_TICs, -ALL_TOT_CLIs)

knitr::kable(data7_datetyp)
TIC_DATE_TYP DIREZIONE TOT_TICs TOT_CLIs PERCENT_TICs PERCENT_CLIs
holiday -1 14522 10759 0.1610174 0.2307709
holiday 1 157868 81742 0.1738929 0.3853501
weekday -1 47294 28981 0.5243877 0.6216164
weekday 1 452679 153338 0.4986297 0.7228696
weekend -1 28373 19565 0.3145949 0.4196517
weekend 1 297299 125137 0.3274773 0.5899238
# Tickets per day type stacked by direction: absolute counts...
ggplot(data7_datetyp,
       aes(x = TIC_DATE_TYP, y = TOT_TICs, fill = DIREZIONE)) +
  geom_col() +
  theme_minimal()

# ...and proportions within each day type
ggplot(data7_datetyp,
       aes(x = TIC_DATE_TYP, y = TOT_TICs, fill = DIREZIONE)) +
  geom_col(position = "fill") +
  theme_minimal()

# Per-ticket totals of gross amount and discount, by direction
data7_importosconto <- data7_complete %>%
  group_by(ID_SCONTRINO, DIREZIONE) %>%
  summarise(IMPORTO_LORDO = sum(IMPORTO_LORDO),
            SCONTO = sum(SCONTO)) %>%
  ungroup() %>%
  as.data.frame()

# Mean gross amount and mean discount per direction
data7_avgimportosconto <- data7_importosconto %>%
  group_by(DIREZIONE) %>%
  summarise(AVG_IMPORTO_LORDO = mean(IMPORTO_LORDO),
            AVG_SCONTO = mean(SCONTO))


# Gross-amount distribution by direction, trimmed to (-1000, 1000)
ggplot(data = data7_importosconto %>%
         filter((IMPORTO_LORDO > -1000) & (IMPORTO_LORDO < 1000)),
       aes(color = DIREZIONE, x = IMPORTO_LORDO)) +
  geom_histogram(binwidth = 10, fill = "white", alpha = 0.5) +
  theme_minimal()

# Discount distribution by direction, trimmed to (-250, 250).
# FIX: the upper bound previously filtered `IMPORTO_LORDO < 250` instead of
# `SCONTO < 250` (copy-paste from the plot above), so the plotted SCONTO
# values were never actually capped on the right.
ggplot(data = data7_importosconto %>%
         filter((SCONTO > -250) & (SCONTO < 250)),
       aes(color = DIREZIONE, x = SCONTO)) +
  geom_histogram(binwidth = 10, fill = "white", alpha = 0.5) +
  theme_minimal()

# Gross amount and discount totals by department and direction
data7_importosconto_cod_rep <- data7_complete %>%
  group_by(COD_REPARTO, DIREZIONE) %>%
  summarise(IMPORTO_LORDO = sum(IMPORTO_LORDO),
            SCONTO = sum(SCONTO)) %>%
  ungroup() %>%
  as.data.frame()

knitr::kable(head(data7_importosconto_cod_rep))
COD_REPARTO DIREZIONE IMPORTO_LORDO SCONTO
1 -1 -157326.2 -11092.76
1 1 3043725.9 145225.84
2 -1 -2016555.4 -173055.40
2 1 30021937.8 2716917.27
3 -1 -681823.8 -45169.53
3 1 8168067.1 348269.70
# Gross amount per department, stacked by direction
ggplot(data7_importosconto_cod_rep,
       aes(x = COD_REPARTO, y = IMPORTO_LORDO, fill = DIREZIONE)) +
  geom_col() +
  theme_minimal()

# Discount per department, stacked by direction
ggplot(data7_importosconto_cod_rep,
       aes(x = COD_REPARTO, y = SCONTO, fill = DIREZIONE)) +
  geom_col() +
  theme_minimal()

# Top-selling items: distinct tickets per article, purchases only
data7_complete$ID_ARTICOLO <- as.factor(data7_complete$ID_ARTICOLO) 

data7_id_articolo <- data7_complete %>%
  filter(DIREZIONE == 1) %>%
  group_by(ID_ARTICOLO) %>%
  summarise(NUM_VENDITE = n_distinct(ID_SCONTRINO)) %>%
  ungroup() %>%
  as.data.frame() %>%
  arrange(desc(NUM_VENDITE))

knitr::kable(head(data7_id_articolo))
ID_ARTICOLO NUM_VENDITE
33700716 57806
33817091 24691
34843564 12804
32882024 6531
34252904 4788
35209202 4758
# Clients ranked by total gross purchase amount (purchases only)
data7_importosconto_id_cli <- data7_complete %>%
  filter(DIREZIONE == 1) %>%
  group_by(ID_CLI) %>%
  summarise(IMPORTO_LORDO = sum(IMPORTO_LORDO),
            SCONTO = sum(SCONTO)) %>%
  ungroup() %>%
  as.data.frame() %>%
  arrange(desc(IMPORTO_LORDO))

knitr::kable(head(data7_importosconto_id_cli))
ID_CLI IMPORTO_LORDO SCONTO
572977 421636.2 40609.00
410777 254098.7 35618.50
562939 202575.7 31966.13
224908 187350.9 18708.09
229720 180023.4 17738.48
96592 179460.3 17913.08
# Number of purchases (distinct tickets) per client, most active first
data7_tot_purch <- data7_complete %>%
  filter(DIREZIONE == 1) %>%
  group_by(ID_CLI) %>%
  summarise(TOT_PURCHASE = n_distinct(ID_SCONTRINO)) %>%
  arrange(desc(TOT_PURCHASE))

knitr::kable(head(data7_tot_purch))
ID_CLI TOT_PURCHASE
376925 177
117212 155
248975 154
341579 149
99384 147
362177 146
# Purchase events only, ordered by client, as input for the
# next-purchase-gap analysis below
data_next_purchase <- data7_complete %>%
  filter(DIREZIONE == 1) %>%
  select(ID_CLI, ID_ARTICOLO, TIC_DATE, DIREZIONE) %>%
  arrange(ID_CLI)



# NOTE(review): memory.limit() is Windows-only and became defunct in R >= 4.2;
# on other platforms or newer R this call warns/errors. Confirm the target
# environment before keeping it.
memory.limit(10000000)
## [1] 1e+07
# Days between consecutive purchase rows per client
# (first row of each client gets Diff = NA)
d7_nextpurchase <- data_next_purchase %>%
  group_by(ID_CLI) %>%
  mutate(Diff = TIC_DATE - lag(TIC_DATE))

# Frequency table of the gap values (Var1 = gap as a factor level)
x <- as.data.frame(table(d7_nextpurchase$Diff))
# drop the first row — presumably the 0-day gap (same-day ticket lines);
# TODO confirm the level order before relying on this
x <- x[-1, ]
x$Perc <- x$Freq/sum(x$Freq)

# Cumulative share of gaps up to each number of days.
# NOTE(review): cumsum(Perc) is evaluated inside aes() over all rows before
# xlim() trims the displayed range, so the curve is the full cumulative total.
ggplot(x, 
       aes(x = as.numeric(Var1), y = cumsum(Perc))) +
  xlim(0, 100) + 
  geom_line()

Modello RFM

L’analisi Recency, Frequency, Monetary (RFM) è un’analisi di marketing che utilizza appunto queste 3 metriche per segmentare la base dei clienti. La Recency valuta quanto recentemente il cliente ha fatto un acquisto, la Frequency valuta quanto spesso un cliente fa un acquisto e la Monetary quanti soldi spende il cliente per un acquisto.

Si selezionano i clienti attivi, ovvero quelli che hanno effettuato il loro ultimo acquisto dopo il 01/01/2019:

# Active clients: keep only tickets dated after 2019-01-01
rfm_periodo_studio <- data7_complete %>%
  filter(TIC_DATE > as.Date("01/01/2019", format = "%d/%m/%Y"))

knitr::kable(head(rfm_periodo_studio))
ID_SCONTRINO ID_CLI ID_NEG ID_ARTICOLO COD_REPARTO DIREZIONE IMPORTO_LORDO SCONTO TIC_DATETIME TIC_HOUR TIC_DATE TIC_DATE_WEEKDAY TIC_DATE_HOLIDAY TIC_DATE_TYP
51119494802/01/19710359211860 669413 39 36146131 6 1 8.9 0.00 2019-01-02 07:10:35 7 2019-01-02 4 FALSE weekday
51119494802/01/19710359211860 669413 39 36505196 10 1 3.6 0.00 2019-01-02 07:10:35 7 2019-01-02 4 FALSE weekday
511115504902/01/19716539383669 754326 39 36472121 7 1 249.0 0.00 2019-01-02 07:16:53 7 2019-01-02 4 FALSE weekday
511115505002/01/19720238594281 358614 39 34584053 1 1 126.0 12.60 2019-01-02 07:20:23 7 2019-01-02 4 FALSE weekday
511115505002/01/19720238594281 358614 39 34584263 1 1 137.5 13.75 2019-01-02 07:20:23 7 2019-01-02 4 FALSE weekday
511115505002/01/19720238594281 358614 39 33370323 11 1 239.6 23.96 2019-01-02 07:20:23 7 2019-01-02 4 FALSE weekday

Recency

# Recency: for each client (sales only, DIREZIONE == 1), take the date of
# the last purchase and measure the days elapsed up to the end of the
# study window, 30 April 2019.
rfm_recency <- rfm_periodo_studio %>%
  filter(DIREZIONE == 1) %>%
  group_by(ID_CLI) %>%
  summarise(LAST_PURCHASE_DATE = max(TIC_DATE))

rfm_recency$RECENCY <- difftime(as.Date("2019-04-30"),
                                rfm_recency$LAST_PURCHASE_DATE,
                                units = "days")

knitr::kable(head(rfm_recency))
ID_CLI LAST_PURCHASE_DATE RECENCY
23 2019-02-20 69 days
32 2019-04-02 28 days
33 2019-03-02 59 days
48 2019-04-30 0 days
50 2019-02-19 70 days
56 2019-03-11 50 days
# Split clients into recency classes at the 25th/75th percentiles:
# low = 0-25% (most recent buyers), medium = 25-75%, high = 75-100%.
# within() evaluates in the data frame, so columns are referenced directly.
rfm_recency <- within(rfm_recency,
                 REC_CLASS <- cut(as.numeric(RECENCY),
                                  breaks = quantile(RECENCY,
                                                    probs = c(0, .25, .75, 1)),
                                  include.lowest = TRUE,
                                  labels = c("low", "medium", "high")))

rec_label <- as.data.frame(table(rfm_recency$REC_CLASS))

# Bar chart of the class sizes.
# guides(fill = FALSE) is deprecated since ggplot2 3.3.4; use "none".
ggplot(data = rec_label,
       aes(x = Var1, y = Freq,
           fill = Freq)) +
  geom_bar(stat = "identity") +
  labs(title = "Recency Distribution",
       x     = "Recency Classes",
       y     = "Total Purchase") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5)) +
  scale_x_discrete(labels = c("Low", "Medium", "High")) +
  guides(fill = "none")

Frequency

# Frequency: number of distinct tickets per client (sales only),
# sorted from most to least frequent buyer.
rfm_frequency <- rfm_periodo_studio %>%
  filter(DIREZIONE == 1) %>%
  group_by(ID_CLI) %>%
  summarise(TOT_PURCHASE = n_distinct(ID_SCONTRINO)) %>%
  arrange(desc(TOT_PURCHASE))

knitr::kable(head(rfm_frequency))
ID_CLI TOT_PURCHASE
726098 101
756039 92
740391 74
341579 73
3849 71
776800 67
# Frequency classes: low = 1 purchase, medium = 2-4, high = 5 or more.
# Use Inf as the upper break so the binning does not depend on the observed
# maximum (the original hard-coded 101, the current max, which would produce
# NA for any client with more purchases).
rfm_frequency <- within(rfm_frequency,
                   FREQ_CLASS <- cut(TOT_PURCHASE,
                                     breaks = c(0, 2, 5, Inf),
                                     include.lowest = TRUE,
                                     right = FALSE,
                                     labels = c("low", "medium", "high")))

table(rfm_frequency$FREQ_CLASS)
## 
##    low medium   high 
##  52169  44097  16840
freq_label <- as.data.frame(table(rfm_frequency$FREQ_CLASS))

# Bar chart of the class sizes.
# guides(fill = FALSE) is deprecated since ggplot2 3.3.4; use "none".
ggplot(data = freq_label,
       aes(x = Var1, y = Freq,
           fill = Freq)) +
  geom_bar(stat = "identity") +
  labs(title = "Frequency Distribution",
       x     = "Frequency Classes",
       y     = "Total Purchase") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5)) +
  scale_x_discrete(labels = c("Low", "Medium", "High")) +
  guides(fill = "none")

Monetary

# Monetary: per-client totals of gross amount and discount (sales only);
# SPESA is the net spend. Within summarise(), freshly created columns can
# be referenced immediately.
rfm_monetary <- rfm_periodo_studio %>%
  filter(DIREZIONE == 1) %>%
  group_by(ID_CLI) %>%
  summarise(IMPORTO_LORDO = sum(IMPORTO_LORDO),
            SCONTO        = sum(SCONTO),
            SPESA         = IMPORTO_LORDO - SCONTO) %>%
  ungroup() %>%
  as.data.frame() %>%
  arrange(desc(IMPORTO_LORDO))

knitr::kable(head(rfm_monetary))
ID_CLI IMPORTO_LORDO SCONTO SPESA
410777 254098.70 35618.50 218480.20
562939 198332.22 31602.10 166730.12
96592 174311.92 17426.00 156885.92
600594 113035.14 11411.11 101624.03
4339 109380.64 15797.16 93583.48
787381 97176.15 9907.49 87268.66
# Split clients into monetary classes at the 25th/75th percentiles of net
# spend: low = 0-25%, medium = 25-75%, high = 75-100%.
rfm_monetary <- within(rfm_monetary,
                   MON_CLASS <- cut(SPESA,
                                    breaks = quantile(SPESA,
                                                      probs = c(0, .25, .75, 1)),
                                    include.lowest = TRUE,
                                    labels = c("low", "medium", "high")))

table(rfm_monetary$MON_CLASS)
## 
##    low medium   high 
##  28277  56553  28276
mon_label <- as.data.frame(table(rfm_monetary$MON_CLASS))

# Bar chart of the class sizes.
# guides(fill = FALSE) is deprecated since ggplot2 3.3.4; use "none".
ggplot(data = mon_label,
       aes(x = Var1, y = Freq,
           fill = Freq)) +
  geom_bar(stat = "identity") +
  labs(title = "Monetary Distribution",
       x     = "Monetary Classes",
       y     = "Total Amount") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5)) +
  scale_x_discrete(labels = c("Low", "Medium", "High")) +
  guides(fill = "none")

Merge

Creazione del dataset contenente tutte e 3 le metriche

# Combine the three metric tables into one data frame with two successive
# inner joins on ID_CLI (merge() defaults to an inner join).
rfm <- Reduce(function(left, right) merge(left, right, by = "ID_CLI"),
              list(rfm_frequency, rfm_monetary, rfm_recency))

knitr::kable(head(rfm))
ID_CLI TOT_PURCHASE FREQ_CLASS IMPORTO_LORDO SCONTO SPESA MON_CLASS LAST_PURCHASE_DATE RECENCY REC_CLASS
23 1 low 38.07 0.00 38.07 low 2019-02-20 69 days medium
32 5 high 903.61 0.00 903.61 high 2019-04-02 28 days medium
33 1 low 48.45 0.00 48.45 medium 2019-03-02 59 days medium
48 12 high 2135.95 306.65 1829.30 high 2019-04-30 0 days low
50 2 medium 437.25 62.56 374.69 high 2019-02-19 70 days medium
56 4 medium 586.49 66.30 520.19 high 2019-03-11 50 days medium
# Combine the recency and frequency classes into a loyalty status (RF).
# Vectorized lookup table instead of the original row-wise loop with nine
# if-statements: same mapping, O(n) with no per-row dispatch, and it yields
# NA (rather than an error) if a class is ever missing.
rf_lookup <- c("low.low"       = "One-Timer",
               "medium.low"    = "One-Timer",
               "high.low"      = "Leaving",
               "low.medium"    = "Engaged",
               "medium.medium" = "Engaged",
               "high.medium"   = "Leaving",
               "low.high"      = "Top",
               "medium.high"   = "Top",
               "high.high"     = "Leaving Top")

rfm$RF <- unname(rf_lookup[paste(rfm$REC_CLASS, rfm$FREQ_CLASS, sep = ".")])

table(rfm$RF)
## 
##     Engaged     Leaving Leaving Top   One-Timer         Top 
##       36316       27187         592       32763       16248

Creiamo il grafico per la variabile RF:

# RF heatmap: recency x frequency grid labelled with the loyalty status and
# shaded by the class counts (values copied from table(rfm$RF) above).
rf_data <- data.frame(
  Level     = c("Top", "Top", "Leaving Top",
                "Engaged", "Engaged", "Leaving",
                "One Timer", "One Timer", "Leaving"),
  Frequency = c("High", "High", "High",
                "Medium", "Medium", "Medium",
                "Low", "Low", "Low"),
  Recency   = c("Low", "Medium", "High",
                "Low", "Medium", "High",
                "Low", "Medium", "High"),
  Value     = c(16248, 16248, 592,
                36316, 36316, 27187,
                32763, 32763, 27187),
  stringsAsFactors = FALSE
)

rf_data$Frequency <- factor(rf_data$Frequency,
                            levels = c("High", "Medium", "Low"))
rf_data$Recency <- factor(rf_data$Recency,
                          levels = c("High", "Medium", "Low"))
rf_data$Value <- as.numeric(rf_data$Value)

ggplot(rf_data, aes(x = Frequency, y = Recency, fill = Value)) +
  geom_tile() +
  geom_text(aes(label = Level)) +
  scale_fill_distiller(palette = "Spectral") +
  theme_minimal() +
  labs(title = "TABELLA RF")

rf <- as.data.frame(table(rfm$RF))
rf
##          Var1  Freq
## 1     Engaged 36316
## 2     Leaving 27187
## 3 Leaving Top   592
## 4   One-Timer 32763
## 5         Top 16248
# Bar chart of the RF class sizes.
# NOTE(review): scale_colour_brewer() has no effect here — the bars map to
# fill, not colour; kept for output fidelity, candidate for removal.
# guides(fill = FALSE) is deprecated since ggplot2 3.3.4; use "none".
ggplot(data = rf,
       aes(x = Var1, y = Freq,
           fill = Freq)) +
  geom_bar(stat = "identity") +
  scale_colour_brewer(palette = "RdBu") +
  labs(title = "DISTRIBUZIONE CLASSI RF",
       x     = "RF Classes",
       y     = "Total Clients") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5)) +
  scale_x_discrete(labels = c("Engaged", "Leaving", "Leaving Top",
                              "One Timer", "Top")) +
  guides(fill = "none")

Infine, le classi RFM sono ottenute combinando le classi RF con i gruppi di monetary:

# Combine the RF loyalty status with the monetary class into the final RFM
# segment. Vectorized lookup table instead of the original row-wise loop
# with fifteen if-statements: same mapping, no per-row dispatch.
rfm_lookup <- c("One-Timer.low"      = "Cheap",
                "One-Timer.medium"   = "Tin",
                "One-Timer.high"     = "Copper",
                "Leaving.low"        = "Tin",
                "Leaving.medium"     = "Copper",
                "Leaving.high"       = "Bronze",
                "Engaged.low"        = "Copper",
                "Engaged.medium"     = "Bronze",
                "Engaged.high"       = "Silver",
                "Leaving Top.low"    = "Bronze",
                "Leaving Top.medium" = "Silver",
                "Leaving Top.high"   = "Gold",
                "Top.low"            = "Silver",
                "Top.medium"         = "Gold",
                "Top.high"           = "Diamond")

rfm$RFM <- unname(rfm_lookup[paste(rfm$RF, rfm$MON_CLASS, sep = ".")])
# RFM heatmap: RF class x monetary class grid labelled with the resulting
# RFM segment and shaded by the segment counts (values copied from
# table(rfm$RFM)).
rfm_data <- data.frame(
  RF       = rep(c("Top", "Leaving Top", "Engaged", "Leaving", "One Timer"),
                 each = 3),
  Monetary = rep(c("High", "Medium", "Low"), times = 5),
  Level    = c("Diamond", "Gold", "Silver",
               "Gold", "Silver", "Bronze",
               "Silver", "Bronze", "Copper",
               "Bronze", "Copper", "Tin",
               "Copper", "Tin", "Cheap"),
  Value    = c(10984, 5585, 10306,
               5585, 10306, 25932,
               10306, 25932, 20938,
               25932, 20938, 24967,
               20938, 24967, 14394),
  stringsAsFactors = FALSE
)

rfm_data$RF <- factor(rfm_data$RF,
                      levels = c("Top", "Leaving Top",
                                 "Engaged", "Leaving", "One Timer"))
rfm_data$Monetary <- factor(rfm_data$Monetary,
                            levels = c("Low", "Medium", "High"))
rfm_data$Value <- as.numeric(rfm_data$Value)

ggplot(rfm_data, aes(x = RF, y = Monetary, fill = Value)) +
  geom_tile() +
  geom_text(aes(label = Level)) +
  scale_fill_distiller(palette = "RdBu") +
  theme_minimal() +
  labs(title = "TABELLA RFM")

rfm_plot <- as.data.frame(table(rfm$RFM))

# Bar chart of the RFM segment sizes.
# NOTE(review): scale_colour_brewer() has no effect here — the bars map to
# fill, not colour; kept for output fidelity, candidate for removal.
# guides(fill = FALSE) is deprecated since ggplot2 3.3.4; use "none".
ggplot(data = rfm_plot,
       aes(x = Var1, y = Freq,
           fill = Freq)) +
  geom_bar(stat = "identity") +
  scale_colour_brewer(palette = "Set1") +
  labs(title = "DISTRIBUZIONE CLASSI RFM",
       x     = "RFM Classes",
       y     = "Total Clients") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5)) +
  scale_x_discrete(labels = c("Bronze", "Cheap", "Copper", "Diamond",
                              "Gold", "Silver", "Tin")) +
  guides(fill = "none")

Churn

Il churn rate misura la percentuale di clienti che, essendo già abbonati a un prodotto o a un servizio, disdicono l'abbonamento entro un certo lasso di tempo.

# Reference window: sales tickets strictly between 1 October 2018 and
# 1 January 2019 (both bounds exclusive, as in the original comparisons).
churn_periodo_studio <- data7_complete %>%
  filter(DIREZIONE == 1,
         TIC_DATE < as.Date("2019-01-01"),
         TIC_DATE > as.Date("2018-10-01"))

knitr::kable(head(churn_periodo_studio))
ID_SCONTRINO ID_CLI ID_NEG ID_ARTICOLO COD_REPARTO DIREZIONE IMPORTO_LORDO SCONTO TIC_DATETIME TIC_HOUR TIC_DATE TIC_DATE_WEEKDAY TIC_DATE_HOLIDAY TIC_DATE_TYP
513715359302/10/18708198952096 538298 15 48020112 2 1 224.14 0.00 2018-10-02 07:08:19 7 2018-10-02 3 FALSE weekday
512310497402/10/18710039040884 585553 29 35554932 1 1 90.10 16.15 2018-10-02 07:10:03 7 2018-10-02 3 FALSE weekday
512310497402/10/18710039040884 585553 29 81256071 1 1 59.13 15.93 2018-10-02 07:10:03 7 2018-10-02 3 FALSE weekday
51521089902/10/18711052729620 35501 4 36137402 7 1 50.00 5.00 2018-10-02 07:11:05 7 2018-10-02 3 FALSE weekday
51521089902/10/18711052729620 35501 4 36137472 7 1 130.00 13.00 2018-10-02 07:11:05 7 2018-10-02 3 FALSE weekday
51521089902/10/18711052729620 35501 4 36137605 7 1 130.00 13.00 2018-10-02 07:11:05 7 2018-10-02 3 FALSE weekday
# Holdout window: sales tickets strictly between 1 Jan 2019 and 28 Feb 2019.
# NOTE(review): both endpoints are excluded by < / >; confirm whether the
# stated 01/01-28/02 period should include them (>= / <=).
churn_holdout <- data7_complete %>%
  filter(DIREZIONE == 1,
         TIC_DATE < as.Date("2019-02-28"),
         TIC_DATE > as.Date("2019-01-01"))

# Clients who purchased during the holdout are, by definition, non-churners.
no_churner <- unique(churn_holdout$ID_CLI)

knitr::kable(head(churn_holdout))
ID_SCONTRINO ID_CLI ID_NEG ID_ARTICOLO COD_REPARTO DIREZIONE IMPORTO_LORDO SCONTO TIC_DATETIME TIC_HOUR TIC_DATE TIC_DATE_WEEKDAY TIC_DATE_HOLIDAY TIC_DATE_TYP
51119494802/01/19710359211860 669413 39 36146131 6 1 8.9 0.00 2019-01-02 07:10:35 7 2019-01-02 4 FALSE weekday
51119494802/01/19710359211860 669413 39 36505196 10 1 3.6 0.00 2019-01-02 07:10:35 7 2019-01-02 4 FALSE weekday
511115504902/01/19716539383669 754326 39 36472121 7 1 249.0 0.00 2019-01-02 07:16:53 7 2019-01-02 4 FALSE weekday
511115505002/01/19720238594281 358614 39 34584053 1 1 126.0 12.60 2019-01-02 07:20:23 7 2019-01-02 4 FALSE weekday
511115505002/01/19720238594281 358614 39 34584263 1 1 137.5 13.75 2019-01-02 07:20:23 7 2019-01-02 4 FALSE weekday
511115505002/01/19720238594281 358614 39 33370323 11 1 239.6 23.96 2019-01-02 07:20:23 7 2019-01-02 4 FALSE weekday

Successivamente, si sceglie la durata del periodo di lookback prima della data di riferimento:

# Lookback features (3-month reference window) per client:
# recency = days from last purchase to 1 Jan 2019, frequency = distinct
# tickets, monetary = gross amount, discount and net spend.
churn_recency <- churn_periodo_studio %>%
  filter(DIREZIONE == 1) %>%
  group_by(ID_CLI) %>%
  summarise(LAST_PURCHASE_DATE = max(TIC_DATE))

churn_recency$RECENCY <- difftime(as.Date("2019-01-01"),
                                  churn_recency$LAST_PURCHASE_DATE,
                                  units = "days")


churn_frequency <- churn_periodo_studio %>%
  filter(DIREZIONE == 1) %>%
  group_by(ID_CLI) %>%
  summarise(TOT_PURCHASE = n_distinct(ID_SCONTRINO)) %>%
  arrange(desc(TOT_PURCHASE))


churn_monetary <- churn_periodo_studio %>%
  filter(DIREZIONE == 1) %>%
  group_by(ID_CLI) %>%
  summarise(IMPORTO_LORDO = sum(IMPORTO_LORDO),
            SCONTO        = sum(SCONTO),
            SPESA         = IMPORTO_LORDO - SCONTO) %>%
  ungroup() %>%
  as.data.frame() %>%
  arrange(desc(IMPORTO_LORDO))

# Final assembly of the churn feature table (inner joins on ID_CLI).
churn <- merge(churn_recency, churn_frequency, by = "ID_CLI")
churn <- merge(churn, churn_monetary, by = "ID_CLI") %>%
  select(ID_CLI,
         RECENCY,
         SPESA,
         TOT_PURCHASE)

knitr::kable(head(churn))
ID_CLI RECENCY SPESA TOT_PURCHASE
5 39 days 188.28 1
18 39 days 50.69 1
23 11 days 1736.88 8
28 82 days 1864.48 1
48 3 days 2047.45 11
58 5 days 39.94 1
# Target variable: CHURN = 1 if the client made no purchase in the holdout
# window, 0 otherwise. (The next-purchase curve cannot be reused here: in
# data7_complete the same client is counted again each time they return,
# while here each client is considered exactly once.)
# Vectorized membership test replaces the original row-wise loop.
churn$CHURN <- as.factor(ifelse(churn$ID_CLI %in% no_churner, 0, 1))

table(churn$CHURN)
## 
##     0     1 
## 35373 59752
# Predictors: attach TYP_JOB, LAST_COD_FID and REGION (the latter obtained
# by joining the client's address to the address table).
churn <- left_join(churn, data2_complete[, c("ID_CLI", "TYP_JOB")], by = "ID_CLI")

churn <- left_join(churn, data1_complete[, c("ID_CLI", "LAST_COD_FID")], by = "ID_CLI")

regione <- left_join(data2_complete[, c("ID_CLI", "ID_ADDRESS")],
                     data3_complete[, c("ID_ADDRESS", "REGION")], by = "ID_ADDRESS")

churn <- left_join(churn, regione, by = "ID_CLI")
# Drop the join key by name rather than the positional churn[, -8], which
# silently removes the wrong column if the column order ever changes.
churn$ID_ADDRESS <- NULL

knitr::kable(head(churn))
ID_CLI RECENCY SPESA TOT_PURCHASE CHURN TYP_JOB LAST_COD_FID REGION
5 39 days 188.28 1 1 (missing) STANDARD BIZ LOMBARDIA
18 39 days 50.69 1 1 (missing) STANDARD LOMBARDIA
23 11 days 1736.88 8 0 (missing) STANDARD LOMBARDIA
28 82 days 1864.48 1 1 (missing) PREMIUM BIZ LOMBARDIA
48 3 days 2047.45 11 0 Libero professionista PREMIUM LOMBARDIA
58 5 days 39.94 1 1 (missing) STANDARD LOMBARDIA

Modelli

Si utilizzano dei modelli per cercare di prevedere se i clienti saranno o meno dei churn. Si provano 4 modelli: Tree, Random forest, Logistic regression e Lasso

# Train (70%) / test (30%) split, stratified on the target via
# createDataPartition. The target classes are not strongly imbalanced.
churn <- na.omit(churn)

# Fix the RNG state so the partition (and all downstream model results)
# is reproducible; the original had no seed.
set.seed(1)
train_p <- createDataPartition(churn$CHURN,
                               p = .70,
                               list = FALSE,
                               times = 1)


train <- churn[train_p, ]
test <- churn[-train_p, ]

table(train$CHURN)
## 
##     0     1 
## 23972 39886

Tree

# Classification tree for churn. Per the printed summary below, the fit
# ends up splitting on TOT_PURCHASE only (SPESA appears as a surrogate).
tree <- rpart(CHURN ~ RECENCY + SPESA + TOT_PURCHASE + REGION + TYP_JOB,
              data = train)

# Visualize the fitted tree.
rpart.plot(tree, extra = "auto")

summary(tree)
## Call:
## rpart(formula = CHURN ~ RECENCY + SPESA + TOT_PURCHASE + REGION + 
##     TYP_JOB, data = train)
##   n= 63858 
## 
##         CP nsplit rel error    xerror        xstd
## 1 0.158268      0  1.000000 1.0000000 0.005104468
## 2 0.010000      1  0.841732 0.8542883 0.004920194
## 
## Variable importance
## TOT_PURCHASE        SPESA 
##           89           11 
## 
## Node number 1: 63858 observations,    complexity param=0.158268
##   predicted class=1  expected loss=0.3753954  P(node) =1
##     class counts: 23972 39886
##    probabilities: 0.375 0.625 
##   left son=2 (13440 obs) right son=3 (50418 obs)
##   Primary splits:
##       TOT_PURCHASE < 3.5     to the right, improve=2404.40100, (0 missing)
##       RECENCY      < 25.5    to the left,  improve=1783.54700, (0 missing)
##       SPESA        < 379.805 to the right, improve= 481.01080, (0 missing)
##       TYP_JOB      splits as  LLLLLLRRLRLLLLR, improve=  70.30463, (0 missing)
##       REGION       splits as  LRRLLLLLLRRLLRLLRLRL, improve=  66.00161, (0 missing)
##   Surrogate splits:
##       SPESA   < 970.53  to the right, agree=0.815, adj=0.12, (0 split)
##       RECENCY < 1.5     to the left,  agree=0.790, adj=0.00, (0 split)
##       TYP_JOB splits as  RRRRRRRRRRRRLRR, agree=0.790, adj=0.00, (0 split)
## 
## Node number 2: 13440 observations
##   predicted class=0  expected loss=0.3588542  P(node) =0.210467
##     class counts:  8617  4823
##    probabilities: 0.641 0.359 
## 
## Node number 3: 50418 observations
##   predicted class=1  expected loss=0.3045539  P(node) =0.789533
##     class counts: 15355 35063
##    probabilities: 0.305 0.695
# Complexity-parameter table of the fitted tree (cross-validated error).
printcp(tree) 
## 
## Classification tree:
## rpart(formula = CHURN ~ RECENCY + SPESA + TOT_PURCHASE + REGION + 
##     TYP_JOB, data = train)
## 
## Variables actually used in tree construction:
## [1] TOT_PURCHASE
## 
## Root node error: 23972/63858 = 0.3754
## 
## n= 63858 
## 
##        CP nsplit rel error  xerror      xstd
## 1 0.15827      0   1.00000 1.00000 0.0051045
## 2 0.01000      1   0.84173 0.85429 0.0049202

Random forest

# memory.limit() is Windows-only and defunct since R 4.2; guard the call so
# the script still runs on other platforms and newer R versions.
if (.Platform$OS.type == "windows" && getRversion() < "4.2.0") {
  memory.limit(10000000)
}
## [1] 1e+07
# Random forest with 100 trees on the same predictors as the single tree.
tree_rf <- randomForest::randomForest(CHURN ~ RECENCY + SPESA + TOT_PURCHASE +
                                        REGION + TYP_JOB,
                                      data = train, ntree = 100)
print(tree_rf)
## 
## Call:
##  randomForest(formula = CHURN ~ RECENCY + SPESA + TOT_PURCHASE +      REGION + TYP_JOB, data = train, ntree = 100) 
##                Type of random forest: classification
##                      Number of trees: 100
## No. of variables tried at each split: 2
## 
##         OOB estimate of  error rate: 31.68%
## Confusion matrix:
##      0     1 class.error
## 0 9511 14461   0.6032455
## 1 5772 34114   0.1447124

Logistic regression

# Logistic regression via caret. Per the printed summary below, RECENCY and
# TOT_PURCHASE are the most significant predictors.
logistic <- train(CHURN ~ RECENCY + SPESA + TOT_PURCHASE +
                    REGION + TYP_JOB,
                  data = train,
                  method = "glm")

summary(logistic)
## 
## Call:
## NULL
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.4368  -1.1621   0.6800   0.9144   4.4599  
## 
## Coefficients:
##                                              Estimate Std. Error z value
## (Intercept)                                 3.655e-01  1.632e-01   2.240
## RECENCY                                     1.635e-02  3.896e-04  41.967
## SPESA                                       1.231e-05  6.690e-06   1.839
## TOT_PURCHASE                               -2.210e-01  4.437e-03 -49.814
## REGIONBASILICATA                            7.222e-01  1.960e-01   3.686
## REGIONCALABRIA                              1.072e+00  2.424e-01   4.420
## REGIONCAMPANIA                              1.027e-01  6.574e-02   1.563
## `REGIONEMILIA ROMAGNA`                      4.751e-02  6.659e-02   0.714
## `REGIONFRIULI VENEZIA GIULIA`               1.205e-01  9.103e-02   1.324
## REGIONLAZIO                                 8.435e-02  6.399e-02   1.318
## REGIONLIGURIA                               1.212e-01  8.085e-02   1.499
## REGIONLOMBARDIA                             3.534e-02  6.027e-02   0.586
## REGIONMARCHE                                1.263e+00  2.096e-01   6.023
## REGIONMOLISE                                7.474e-01  3.154e-01   2.370
## REGIONPIEMONTE                              7.428e-02  6.612e-02   1.124
## REGIONPUGLIA                                1.648e-01  6.836e-02   2.411
## REGIONSARDEGNA                              1.073e+00  2.923e-01   3.672
## REGIONSICILIA                               1.597e-01  6.727e-02   2.374
## REGIONTOSCANA                               3.023e-01  7.417e-02   4.076
## `REGIONTRENTINO ALTO ADIGE`                 1.079e+00  2.811e-01   3.837
## REGIONUMBRIA                                1.246e-01  8.844e-02   1.409
## `REGIONVALLE D'AOSTA`                       5.428e-01  3.883e-01   1.398
## REGIONVENETO                                8.153e-02  6.447e-02   1.265
## TYP_JOBArtigiano                           -4.589e-01  3.869e-01  -1.186
## TYP_JOBCasalinga                           -1.347e-01  4.180e-01  -0.322
## `TYP_JOBCommerciante/Esercente`            -1.084e+00  4.726e-01  -2.293
## `TYP_JOBDirigente/Quadro/Funzionario`      -7.144e-02  2.735e-01  -0.261
## `TYP_JOBImpiegato/a`                       -3.778e-01  1.809e-01  -2.088
## TYP_JOBImprenditore                         2.209e-01  4.329e-01   0.510
## `TYP_JOBIn cerca di occupazione`           -4.507e-02  5.713e-01  -0.079
## `TYP_JOBLibero professionista`             -2.209e-01  1.659e-01  -1.331
## `TYP_JOBNon Dichiara`                       4.109e-01  3.339e-01   1.231
## `TYP_JOBOperaio/a`                         -9.171e-01  2.518e-01  -3.642
## `TYP_JOBPensionato/a`                      -2.235e-01  2.321e-01  -0.963
## `TYP_JOBRappresentante/Agente Commerciale` -1.166e+00  1.331e+00  -0.876
## TYP_JOBStudente                            -2.859e-01  7.189e-01  -0.398
## `TYP_JOB(missing)`                          6.472e-02  1.521e-01   0.426
##                                            Pr(>|z|)    
## (Intercept)                                0.025113 *  
## RECENCY                                     < 2e-16 ***
## SPESA                                      0.065863 .  
## TOT_PURCHASE                                < 2e-16 ***
## REGIONBASILICATA                           0.000228 ***
## REGIONCALABRIA                             9.87e-06 ***
## REGIONCAMPANIA                             0.118116    
## `REGIONEMILIA ROMAGNA`                     0.475533    
## `REGIONFRIULI VENEZIA GIULIA`              0.185564    
## REGIONLAZIO                                0.187424    
## REGIONLIGURIA                              0.133850    
## REGIONLOMBARDIA                            0.557597    
## REGIONMARCHE                               1.71e-09 ***
## REGIONMOLISE                               0.017791 *  
## REGIONPIEMONTE                             0.261223    
## REGIONPUGLIA                               0.015926 *  
## REGIONSARDEGNA                             0.000241 ***
## REGIONSICILIA                              0.017620 *  
## REGIONTOSCANA                              4.59e-05 ***
## `REGIONTRENTINO ALTO ADIGE`                0.000124 ***
## REGIONUMBRIA                               0.158699    
## `REGIONVALLE D'AOSTA`                      0.162092    
## REGIONVENETO                               0.206004    
## TYP_JOBArtigiano                           0.235549    
## TYP_JOBCasalinga                           0.747294    
## `TYP_JOBCommerciante/Esercente`            0.021854 *  
## `TYP_JOBDirigente/Quadro/Funzionario`      0.793911    
## `TYP_JOBImpiegato/a`                       0.036772 *  
## TYP_JOBImprenditore                        0.609903    
## `TYP_JOBIn cerca di occupazione`           0.937122    
## `TYP_JOBLibero professionista`             0.183101    
## `TYP_JOBNon Dichiara`                      0.218442    
## `TYP_JOBOperaio/a`                         0.000270 ***
## `TYP_JOBPensionato/a`                      0.335605    
## `TYP_JOBRappresentante/Agente Commerciale` 0.381019    
## TYP_JOBStudente                            0.690867    
## `TYP_JOB(missing)`                         0.670439    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 84518  on 63857  degrees of freedom
## Residual deviance: 75926  on 63821  degrees of freedom
## AIC: 76000
## 
## Number of Fisher Scoring iterations: 4

Lasso

# Penalized logistic regression (glmnet); caret tunes alpha and lambda by
# bootstrap resampling, so this covers lasso/elastic-net fits.
lasso <- train(CHURN ~ RECENCY + SPESA + TOT_PURCHASE +
                 REGION + TYP_JOB,
               data = train,
               method = "glmnet",
               family = "binomial")

lasso
## glmnet 
## 
## 63858 samples
##     5 predictor
##     2 classes: '0', '1' 
## 
## No pre-processing
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 63858, 63858, 63858, 63858, 63858, 63858, ... 
## Resampling results across tuning parameters:
## 
##   alpha  lambda        Accuracy   Kappa    
##   0.10   0.0002840866  0.6932920  0.2747197
##   0.10   0.0028408662  0.6932988  0.2746677
##   0.10   0.0284086619  0.6919673  0.2634784
##   0.55   0.0002840866  0.6934521  0.2754803
##   0.55   0.0028408662  0.6934877  0.2745276
##   0.55   0.0284086619  0.6891087  0.2490805
##   1.00   0.0002840866  0.6934385  0.2755592
##   1.00   0.0028408662  0.6934211  0.2738142
##   1.00   0.0284086619  0.6865659  0.2367597
## 
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were alpha = 0.55 and lambda = 0.002840866.
plot(lasso)

Analisi delle prediction per ogni modello

# Evaluate accuracy and the full set of prediction metrics for each trained
# model, starting with the decision tree (CHURN is column 5 of `test`).
pred_tree <- predict(tree, test[,-5], type = "class")
# predict(type = "class") already returns an atomic factor, so the original
# unlist() call was a no-op and has been dropped.
p1 <- pred_tree
confusionMatrix(p1, test$CHURN)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0  3675  2154
##          1  6598 14939
##                                           
##                Accuracy : 0.6802          
##                  95% CI : (0.6746, 0.6857)
##     No Information Rate : 0.6246          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.2536          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.3577          
##             Specificity : 0.8740          
##          Pos Pred Value : 0.6305          
##          Neg Pred Value : 0.6936          
##              Prevalence : 0.3754          
##          Detection Rate : 0.1343          
##    Detection Prevalence : 0.2130          
##       Balanced Accuracy : 0.6159          
##                                           
##        'Positive' Class : 0               
## 
# Random forest: confusion matrix of predicted vs. observed churn classes.
pred_rf <- predict(tree_rf, test[,-5], type = "class")
confusionMatrix(data = pred_rf, reference = test$CHURN)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0  4031  2423
##          1  6242 14670
##                                           
##                Accuracy : 0.6834          
##                  95% CI : (0.6778, 0.6889)
##     No Information Rate : 0.6246          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.2707          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.3924          
##             Specificity : 0.8582          
##          Pos Pred Value : 0.6246          
##          Neg Pred Value : 0.7015          
##              Prevalence : 0.3754          
##          Detection Rate : 0.1473          
##    Detection Prevalence : 0.2358          
##       Balanced Accuracy : 0.6253          
##                                           
##        'Positive' Class : 0               
## 
# Logistic regression: type = "raw" makes caret return class labels.
pred_log <- predict(logistic, test[,-5], type = "raw")
confusionMatrix(data = pred_log, reference = test$CHURN)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0  3523  1709
##          1  6750 15384
##                                           
##                Accuracy : 0.6909          
##                  95% CI : (0.6854, 0.6964)
##     No Information Rate : 0.6246          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.2693          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.3429          
##             Specificity : 0.9000          
##          Pos Pred Value : 0.6734          
##          Neg Pred Value : 0.6950          
##              Prevalence : 0.3754          
##          Detection Rate : 0.1287          
##    Detection Prevalence : 0.1912          
##       Balanced Accuracy : 0.6215          
##                                           
##        'Positive' Class : 0               
## 
# Elastic net: class-label predictions evaluated against the test outcomes.
pred_lasso <- predict(lasso, test[,-5], type = "raw")
confusionMatrix(data = pred_lasso, reference = test$CHURN)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0  3452  1657
##          1  6821 15436
##                                           
##                Accuracy : 0.6902          
##                  95% CI : (0.6847, 0.6957)
##     No Information Rate : 0.6246          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.2657          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.3360          
##             Specificity : 0.9031          
##          Pos Pred Value : 0.6757          
##          Neg Pred Value : 0.6935          
##              Prevalence : 0.3754          
##          Detection Rate : 0.1261          
##    Detection Prevalence : 0.1867          
##       Balanced Accuracy : 0.6195          
##                                           
##        'Positive' Class : 0               
## 

Calcolo di curve gain e lift per ogni modello

# Compute class-"0" probabilities (column 1 of caret's probability output)
# for each model, used below for the gain and lift curves; the decision
# tree turns out to be the weakest of the four models.
prob_tree <- predict(tree, test[,-5], "prob")[,1]
prob_rf <- predict(tree_rf, test[,-5], "prob")[,1]
prob_log <- predict(logistic, test[,-5], "prob")[,1]
prob_lasso <- predict(lasso, test[,-5], "prob")[,1]

# Assemble the scores and the observed outcome in one data frame; building
# it with data.frame() and explicit names avoids the intermediate matrix of
# cbind() and the separate colnames() assignment.
d_class <- data.frame(prob_tree = prob_tree,
                      prob_rf = prob_rf,
                      prob_log = prob_log,
                      prob_lasso = prob_lasso,
                      churn = test$CHURN)
head(d_class)
##    prob_tree prob_rf  prob_log prob_lasso churn
## 11 0.3045539    0.18 0.3513166  0.3496427     1
## 12 0.3045539    0.13 0.2001193  0.2013677     1
## 15 0.3045539    0.55 0.3534518  0.3615758     1
## 16 0.3045539    0.07 0.3240113  0.3323843     1
## 17 0.3045539    0.04 0.2374554  0.2391976     1
## 23 0.3045539    0.07 0.2670007  0.2753297     0
lift_tree = gain_lift(data = d_class, score = 'prob_tree', target = 'churn')

##    Population   Gain Lift Score.Point
## 1          10  35.77 3.58   0.6411458
## 2          20  35.77 1.79   0.6411458
## 3          30 100.00 3.33   0.3045539
## 4          40 100.00 2.50   0.3045539
## 5          50 100.00 2.00   0.3045539
## 6          60 100.00 1.67   0.3045539
## 7          70 100.00 1.43   0.3045539
## 8          80 100.00 1.25   0.3045539
## 9          90 100.00 1.11   0.3045539
## 10        100 100.00 1.00   0.3045539
lift_rf = gain_lift(data = d_class, score = 'prob_rf', target = 'churn')

##    Population   Gain Lift Score.Point
## 1          10  19.74 1.97        0.74
## 2          20  34.70 1.74        0.56
## 3          30  47.34 1.58        0.41
## 4          40  57.31 1.43        0.29
## 5          50  67.30 1.35        0.20
## 6          60  75.56 1.26        0.14
## 7          70  82.35 1.18        0.10
## 8          80  90.12 1.13        0.06
## 9          90  95.82 1.06        0.03
## 10        100 100.00 1.00        0.00
lift_log = gain_lift(data = d_class, score = 'prob_log', target = 'churn')

##    Population   Gain Lift Score.Point
## 1          10  19.90 1.99  0.61693588
## 2          20  35.38 1.77  0.49056291
## 3          30  48.51 1.62  0.42561676
## 4          40  59.01 1.48  0.38361604
## 5          50  68.12 1.36  0.34270890
## 6          60  75.62 1.26  0.30109781
## 7          70  83.00 1.19  0.26437609
## 8          80  89.28 1.12  0.22403495
## 9          90  94.84 1.05  0.18236461
## 10        100 100.00 1.00  0.04927497
lift_lasso = gain_lift(data = d_class, score = 'prob_lasso', target = 'churn')

##    Population   Gain Lift Score.Point
## 1          10  19.84 1.98  0.61034711
## 2          20  35.40 1.77  0.48726157
## 3          30  48.53 1.62  0.42384815
## 4          40  59.06 1.48  0.38365000
## 5          50  68.08 1.36  0.34396966
## 6          60  75.76 1.26  0.30359914
## 7          70  82.98 1.19  0.26614535
## 8          80  89.21 1.12  0.22648916
## 9          90  94.87 1.05  0.18637853
## 10        100 100.00 1.00  0.06264479

Scoring

Risultati dei modelli su clienti di cui non si conosce ancora se saranno churn o meno

# Scoring: repeat all the feature-engineering steps applied earlier, but on
# the scoring window 01/03/2019 - 30/04/2019 (strict bounds, so both
# endpoints are excluded, matching the original filter).
# ISO-8601 literals parse with as.Date()'s default format, so no explicit
# format string is needed.
churn_periodo_studio_pred <- data7_complete %>%
  filter(DIREZIONE == 1,
         TIC_DATE < as.Date("2019-04-30"),
         TIC_DATE > as.Date("2019-03-01"))


# Recency: days elapsed since each client's last purchase, measured from the
# end of the scoring window (30/04/2019). The DIREZIONE == 1 filter is
# already applied in churn_periodo_studio_pred, so it is not repeated here,
# and RECENCY is computed inside the pipeline instead of via a separate
# column assignment.
churn_recency_pred <- churn_periodo_studio_pred %>%
  group_by(ID_CLI) %>%
  summarise(LAST_PURCHASE_DATE = max(TIC_DATE)) %>%
  mutate(RECENCY = difftime(as.Date("2019-04-30"),
                            LAST_PURCHASE_DATE,
                            units = "days"))


# Frequency: number of distinct receipts per client in the scoring window.
# The redundant filter(DIREZIONE == 1) was removed: the input is already
# restricted to DIREZIONE == 1.
churn_frequency_pred <- churn_periodo_studio_pred %>%
  group_by(ID_CLI) %>%
  summarise(TOT_PURCHASE = n_distinct(ID_SCONTRINO)) %>%
  arrange(desc(TOT_PURCHASE))

 
# Monetary: gross amount, total discount and net spend per client. Later
# terms of summarise() can reference the columns computed just before them
# (SPESA uses the freshly aggregated IMPORTO_LORDO and SCONTO). The
# redundant filter(DIREZIONE == 1) was removed: the input is already
# restricted to DIREZIONE == 1.
churn_monetary_pred <- churn_periodo_studio_pred %>%
  group_by(ID_CLI) %>%
  summarise(IMPORTO_LORDO = sum(IMPORTO_LORDO),
            SCONTO = sum(SCONTO),
            SPESA = IMPORTO_LORDO - SCONTO) %>%
  ungroup() %>%
  as.data.frame() %>%
  arrange(desc(IMPORTO_LORDO))

# Combine the recency, frequency and monetary components into a single
# scoring table (merge() performs an inner join on ID_CLI and sorts by the
# key, exactly as before — only the chain is now written as a pipeline).
churn_pred <- churn_recency_pred %>%
  merge(churn_frequency_pred, by = "ID_CLI") %>%
  merge(churn_monetary_pred, by = "ID_CLI") %>%
  select(ID_CLI,
         RECENCY,
         SPESA,
         TOT_PURCHASE)

knitr::kable(head(churn_pred))
ID_CLI RECENCY SPESA TOT_PURCHASE
32 28 days 903.61 5
33 59 days 48.45 1
48 4 days 1350.99 9
56 50 days 38.74 1
76 53 days 224.85 1
77 3 days 1200.04 8
# Enrich the scoring table with the client's job type and last fidelity code.
churn_pred <- left_join(churn_pred, data2_complete[, c("ID_CLI", "TYP_JOB")], by = "ID_CLI")

churn_pred <- left_join(churn_pred, data1_complete[, c("ID_CLI", "LAST_COD_FID")], by = "ID_CLI")

# Region is reached through the client's address (ID_CLI -> ID_ADDRESS -> REGION).
regione <- left_join(data2_complete[, c("ID_CLI", "ID_ADDRESS")],
                     data3_complete[, c("ID_ADDRESS", "REGION")], by = "ID_ADDRESS")

churn_pred <- left_join(churn_pred, regione, by = "ID_CLI")
# Drop the helper join key by name rather than by position: the original
# churn_pred[, -7] removed the 7th column (ID_ADDRESS) and would silently
# delete the wrong column if the layout ever changed.
churn_pred <- select(churn_pred, -ID_ADDRESS)

knitr::kable(head(churn_pred))
ID_CLI RECENCY SPESA TOT_PURCHASE TYP_JOB LAST_COD_FID REGION
32 28 days 903.61 5 (missing) PREMIUM BIZ LOMBARDIA
33 59 days 48.45 1 (missing) PREMIUM LOMBARDIA
48 4 days 1350.99 9 Libero professionista PREMIUM LOMBARDIA
56 50 days 38.74 1 (missing) STANDARD BIZ LOMBARDIA
76 53 days 224.85 1 (missing) STANDARD MARCHE
77 3 days 1200.04 8 (missing) PREMIUM LOMBARDIA
# Drop clients with an incomplete profile, then score the remaining ones
# with the logistic model; the second column of the probability output is
# the probability of the churn class ("1").
churn_pred <- na.omit(churn_pred)

churn_scores <- predict(logistic, churn_pred, type = "prob")
churn_pred$prob_to_churn <- churn_scores[[2]]

knitr::kable(head(churn_pred))
ID_CLI RECENCY SPESA TOT_PURCHASE TYP_JOB LAST_COD_FID REGION prob_to_churn
32 28 days 903.61 5 (missing) PREMIUM BIZ LOMBARDIA 0.4574334
33 59 days 48.45 1 (missing) PREMIUM LOMBARDIA 0.7702695
48 4 days 1350.99 9 Libero professionista PREMIUM LOMBARDIA 0.1509283
56 50 days 38.74 1 (missing) STANDARD BIZ LOMBARDIA 0.7431783
76 53 days 224.85 1 (missing) STANDARD MARCHE 0.9122222
77 3 days 1200.04 8 (missing) PREMIUM LOMBARDIA 0.2246235